diff --git a/.github/workflows/docker/compose/third_parties-compose.yaml b/.github/workflows/docker/compose/third_parties-compose.yaml index 22d8b33587..5e8a7eccd3 100644 --- a/.github/workflows/docker/compose/third_parties-compose.yaml +++ b/.github/workflows/docker/compose/third_parties-compose.yaml @@ -105,3 +105,7 @@ services: PORT_SSH: 2345 dockerfile: comps/third_parties/ipex/src/Dockerfile image: ${REGISTRY:-opea}/ipex-llm:${TAG:-latest} + sglang: + build: + dockerfile: comps/third_parties/sglang/src/Dockerfile + image: ${REGISTRY:-opea}/sglang:${TAG:-latest} diff --git a/comps/llms/src/text-generation/README.md b/comps/llms/src/text-generation/README.md index e069b4f867..3e002e22b0 100644 --- a/comps/llms/src/text-generation/README.md +++ b/comps/llms/src/text-generation/README.md @@ -8,21 +8,23 @@ Overall, this microservice offers a streamlined way to integrate large language ## Validated LLM Models -| Model | TGI-Gaudi | vLLM-CPU | vLLM-Gaudi | OVMS | Optimum-Habana | -| ------------------------------------------- | --------- | -------- | ---------- | -------- | -------------- | -| [Intel/neural-chat-7b-v3-3] | ✓ | ✓ | ✓ | ✓ | ✓ | -| [meta-llama/Llama-2-7b-chat-hf] | ✓ | ✓ | ✓ | ✓ | ✓ | -| [meta-llama/Llama-2-70b-chat-hf] | ✓ | - | ✓ | - | ✓ | -| [meta-llama/Meta-Llama-3-8B-Instruct] | ✓ | ✓ | ✓ | ✓ | ✓ | -| [meta-llama/Meta-Llama-3-70B-Instruct] | ✓ | - | ✓ | - | ✓ | -| [Phi-3] | x | Limit 4K | Limit 4K | Limit 4K | ✓ | -| [Phi-4] | x | x | x | x | ✓ | -| [deepseek-ai/DeepSeek-R1-Distill-Llama-8B] | ✓ | - | ✓ | - | ✓ | -| [deepseek-ai/DeepSeek-R1-Distill-Llama-70B] | ✓ | - | ✓ | - | ✓ | -| [deepseek-ai/DeepSeek-R1-Distill-Qwen-14B] | ✓ | - | ✓ | - | ✓ | -| [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B] | ✓ | - | ✓ | - | ✓ | -| [mistralai/Mistral-Small-24B-Instruct-2501] | ✓ | - | ✓ | - | ✓ | -| [mistralai/Mistral-Large-Instruct-2411] | x | - | ✓ | - | ✓ | +| Model | TGI-Gaudi | vLLM-CPU | vLLM-Gaudi | OVMS | Optimum-Habana | SGLANG-CPU | +| --------------------------------------------------------------------------------------------------------------------- | --------- | -------- | ---------- | -------- | -------------- | ---------- | +| [Intel/neural-chat-7b-v3-3] | ✓ | ✓ | ✓ | ✓ | ✓ | - | +| [meta-llama/Llama-2-7b-chat-hf] | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +| [meta-llama/Llama-2-70b-chat-hf] | ✓ | - | ✓ | - | ✓ | ✓ | +| [meta-llama/Meta-Llama-3-8B-Instruct] | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +| [meta-llama/Meta-Llama-3-70B-Instruct] | ✓ | - | ✓ | - | ✓ | ✓ | +| [Phi-3] | x | Limit 4K | Limit 4K | Limit 4K | ✓ | - | +| [Phi-4] | x | x | x | x | ✓ | - | +| [deepseek-ai/DeepSeek-R1-Distill-Llama-8B] | ✓ | - | ✓ | - | ✓ | - | +| [deepseek-ai/DeepSeek-R1-Distill-Llama-70B] | ✓ | - | ✓ | - | ✓ | - | +| [deepseek-ai/DeepSeek-R1-Distill-Qwen-14B] | ✓ | - | ✓ | - | ✓ | - | +| [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B] | ✓ | - | ✓ | - | ✓ | - | +| [mistralai/Mistral-Small-24B-Instruct-2501] | ✓ | - | ✓ | - | ✓ | - | +| [mistralai/Mistral-Large-Instruct-2411] | x | - | ✓ | - | ✓ | - | +| [meta-llama/Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) | - | - | - | - | - | ✓ | +| [meta-llama/Llama-4-Maverick-17B-128E-Instruct](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct) | - | - | - | - | - | ✓ | ### System Requirements for LLM Models diff --git a/comps/third_parties/sglang/deployment/docker_compose/compose.yaml b/comps/third_parties/sglang/deployment/docker_compose/compose.yaml new file mode 100644 index 0000000000..a2f59cc914 --- 
/dev/null +++ b/comps/third_parties/sglang/deployment/docker_compose/compose.yaml @@ -0,0 +1,26 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + sglang: + image: ${REGISTRY:-opea}/sglang:${TAG:-latest} + privileged: true + shm_size: 10g + container_name: sglang-server + ports: + - ${SGLANG_LLM_PORT:-8699}:8699 + ipc: host + volumes: + - /dev/shm:/dev/shm + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + MODEL_ID: ${MODEL_ID} + HF_TOKEN: ${HF_TOKEN} + SGLANG_LLM_PORT: ${SGLANG_LLM_PORT:-8699} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/third_parties/sglang/src/Dockerfile b/comps/third_parties/sglang/src/Dockerfile new file mode 100644 index 0000000000..eba839f761 --- /dev/null +++ b/comps/third_parties/sglang/src/Dockerfile @@ -0,0 +1,47 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +ARG BASE_IMAGE=ubuntu:22.04 +FROM ${BASE_IMAGE} AS base + +RUN apt-get update && \ + apt-get upgrade -y && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --fix-missing \ + ca-certificates \ + curl \ + g++-11 \ + gcc-11 \ + git \ + make \ + numactl \ + wget + +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 && \ + update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \ + update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100 + +WORKDIR /root + +RUN curl -fsSL -v -o miniforge.sh -O https://github.com/conda-forge/miniforge/releases/download/24.7.1-2/Miniforge3-24.7.1-2-Linux-x86_64.sh && \ + bash miniforge.sh -b -p ./miniforge3 && \ + rm miniforge.sh + +RUN git clone https://github.com/jianan-gu/sglang -b llama4_optimzed_cpu_r1 +RUN . ~/miniforge3/bin/activate && conda create -n sglang python=3.10 && conda activate sglang && \ + cd sglang && pip install -e "python[all_cpu]" && cd .. && conda install -y libsqlite=3.48.0 && \ + pip uninstall -y triton && pip uninstall -y transformers && pip install transformers==4.51.1 && \ + pip install triton==3.1 && pip install intel-openmp==2024.2.0 && pip install transformers +RUN git clone https://github.com/vllm-project/vllm.git -b v0.6.4.post1 && cd vllm && apt-get install -y libnuma-dev && \ + . ~/miniforge3/bin/activate && conda activate sglang && \ + pip install cmake==3.31.2 wheel packaging ninja "setuptools-scm>=8" numpy nvidia-ml-py && \ + pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu && \ + VLLM_TARGET_DEVICE=cpu python setup.py develop && cd .. + +RUN cd sglang && . ~/miniforge3/bin/activate && conda activate sglang && pip uninstall -y torch torchvision && \ + pip install torch==2.6.0 torchvision==0.21.0 --index-url https://download.pytorch.org/whl/cpu && \ + cd sgl-kernel/ && python setup.py install && cd .. 
&& conda install -y gperftools gcc=11 gxx=11 cxx-compiler -c conda-forge
+
+COPY ./comps/third_parties/sglang/src/entrypoint.sh /usr/local/bin/entrypoint.sh
+RUN chmod +x /usr/local/bin/entrypoint.sh
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
diff --git a/comps/third_parties/sglang/src/README.md b/comps/third_parties/sglang/src/README.md
new file mode 100644
index 0000000000..b26ef65c2f
--- /dev/null
+++ b/comps/third_parties/sglang/src/README.md
@@ -0,0 +1,84 @@
+# SGLang Serving microservice
+
+SGLang is an efficient and scalable serving framework for hosting Large Language Models (LLMs) with minimal latency and high throughput.
+Intel has further enhanced SGLang with CPU-specific optimizations, enabling high-performance LLM serving on CPU-only platforms without requiring GPUs.
+We have verified the Llama series models with SGLang, with a particular focus on the latest Llama 4 models, including [meta-llama/Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) and [meta-llama/Llama-4-Maverick-17B-128E-Instruct](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct), as well as the Llama 4 guard model Llama-Guard-4-12B.
+
+## 🚀1. Build the Docker Image
+
+This Docker image (opea/sglang:latest) includes Intel's CPU-specific optimizations, providing enhanced performance for CPU deployments.
+From this directory (comps/third_parties/sglang/src), go back to the repository root and build the SGLang microservice Docker image:
+
+```bash
+cd ../../../../
+docker build -f comps/third_parties/sglang/src/Dockerfile --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/sglang:latest .
+```
+
+✅ Tip: Pass the https_proxy and http_proxy build arguments if you are behind a proxy.
+
+## 🚀2. Start the microservice
+
+Before starting, set the model you want to serve:
+
+```bash
+export MODEL_ID="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
+```
+
+Then navigate to the deployment folder and start the service using Docker Compose:
+
+```bash
+cd comps/third_parties/sglang/deployment/docker_compose
+docker compose -f compose.yaml up -d
+```
+
+## 🚀3. Access the service
+
+Once the service is running, you can test it. For the guard model, use the following Python example:
+
+```python
+import openai
+
+client = openai.Client(base_url="http://127.0.0.1:8699/v1", api_key="EMPTY")
+RESPONSE_CLASSIFICATION_INSTRUCTION = """
+Task: Check if there is unsafe content in 'Agent' messages in conversations according to our safety policy with the below categories.\n\n
+
+\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Defamation.
+\nS6: Specialized Advice. \nS7: Privacy. \nS8: Intellectual Property. \nS9: Indiscriminate Weapons.
+\nS10: Hate. \nS11: Self-Harm. \nS12: Sexual Content. \nS13: Elections. \n"""
+RESPONSE_CLASSIFICATION_TEMPLATE_PREFIX = "{classification_instruction}\n\n\n\nUser: "
+RESPONSE_CLASSIFICATION_TEMPLATE_POSTFIX = """{user_message}\n\nAgent: {model_message}\n\n\n\n
+Provide your safety assessment for ONLY THE LAST Agent message in the above conversation:
+\n - First line must read 'safe' or 'unsafe'.
+\n - If unsafe, a second line must include a comma-separated list of violated categories.""" + +response = client.chat.completions.create( + model="meta-llama/Llama-4-guard", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": RESPONSE_CLASSIFICATION_TEMPLATE_PREFIX.format( + classification_instruction=RESPONSE_CLASSIFICATION_INSTRUCTION + ), + }, + { + "type": "text", + "text": RESPONSE_CLASSIFICATION_TEMPLATE_POSTFIX.format( + user_message="how do I make a bomb?", model_message="I cannot help you with that." + ), + }, + ], + }, + ], + max_tokens=128, +) +print(response.choices[0].message.content) +``` + +You can use the following command for testing non-guard models: + +```bash +http_proxy="" curl -X POST -H "Content-Type: application/json" -d '{"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", "messages": [{"role": "user", "content": "Hello! What is your name?"}], "max_tokens": 128}' http://localhost:8699/v1/chat/completions +``` diff --git a/comps/third_parties/sglang/src/__init__.py b/comps/third_parties/sglang/src/__init__.py new file mode 100644 index 0000000000..4057dc0163 --- /dev/null +++ b/comps/third_parties/sglang/src/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/third_parties/sglang/src/entrypoint.sh b/comps/third_parties/sglang/src/entrypoint.sh new file mode 100644 index 0000000000..92452b990a --- /dev/null +++ b/comps/third_parties/sglang/src/entrypoint.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +. ~/miniforge3/bin/activate && conda activate sglang +export LD_PRELOAD=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libiomp5.so:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libtcmalloc.so +python3 -m sglang.launch_server --model ${MODEL_ID} --trust-remote-code --device cpu --disable-overlap-schedule --chunked-prefill-size 2048 --max-running-requests 32 --mem-fraction-static 0.8 --context-length 65536 --max-total-tokens 65536 --port ${SGLANG_LLM_PORT} --api-key ${HF_TOKEN} --chat-template llama-4 diff --git a/tests/third_parties/_test_third_parties_sglang.sh b/tests/third_parties/_test_third_parties_sglang.sh new file mode 100644 index 0000000000..4a97c88ac7 --- /dev/null +++ b/tests/third_parties/_test_third_parties_sglang.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') +export DATA_PATH=${model_cache} +MODEL_ID="meta-llama/Llama-3.1-8B-Instruct" + +function build_docker_images() { + echo "Start building docker images for microservice" + cd $WORKPATH + docker build --no-cache -t opea/sglang:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/sglang/src/Dockerfile . + if [ $? 
-ne 0 ]; then
+        echo "opea/sglang build failed"
+        exit 1
+    else
+        echo "opea/sglang built successfully"
+    fi
+}
+
+function start_service() {
+    echo "Starting microservice"
+    export host_ip=${ip_address}
+    export MODEL_ID=${MODEL_ID}
+    export TAG=comps
+    cd $WORKPATH
+    cd comps/third_parties/sglang/deployment/docker_compose
+    docker compose -f compose.yaml up -d
+    echo "Microservice started"
+    sleep 120
+}
+
+function validate_microservice() {
+    echo "Validating microservice"
+    # MODEL_ID must be expanded by the shell (not left inside single quotes); entrypoint.sh starts the server with --api-key ${HF_TOKEN}, so send it as a Bearer token.
+    result=$(http_proxy="" curl http://localhost:8699/v1/chat/completions \
+        -X POST \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer ${HF_TOKEN}" \
+        -d '{
+            "model": "'"${MODEL_ID}"'",
+            "messages": [
+                {"role": "user", "content": "What is Deep Learning?"}
+            ],
+            "max_tokens": 32
+        }'
+    )
+    if [[ $result == *"Deep"* ]]; then
+        echo "Result correct."
+    else
+        echo "Result wrong."
+        docker logs sglang-server
+        exit 1
+    fi
+}
+
+function stop_docker() {
+    cid=$(docker ps -aq --filter "name=sglang-server")
+    echo "Shutdown legacy containers: $cid"
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+}
+
+function main() {
+    if grep -qi amx_tile /proc/cpuinfo; then
+        echo "AMX is supported on this machine."
+    else
+        echo "AMX is NOT supported on this machine, skipping this test."
+        exit 0
+    fi
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    stop_docker
+    echo "cleanup container images and volumes"
+    echo y | docker system prune > /dev/null 2>&1
+
+}
+
+main
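The test script above waits a fixed 120 seconds after `docker compose up` before sending its first request, which may not be enough when a large Llama 4 checkpoint is still loading. A readiness poll can be more robust; the sketch below is only an illustration and assumes the OpenAI-compatible `/v1/models` route on port 8699 and the same `HF_TOKEN` API key that entrypoint.sh passes via `--api-key`.

```python
# Sketch of a readiness poll as an alternative to a fixed sleep (assumptions: port 8699,
# OpenAI-compatible /v1/models route, HF_TOKEN used as the API key as in entrypoint.sh).
import os
import time

import openai

client = openai.Client(
    base_url="http://localhost:8699/v1",
    api_key=os.environ.get("HF_TOKEN", "EMPTY"),
)

deadline = time.time() + 600  # give large checkpoints up to 10 minutes to load
while time.time() < deadline:
    try:
        models = client.models.list()
        print("server ready, serving:", [m.id for m in models.data])
        break
    except Exception as exc:  # connection refused or 5xx while the model is still loading
        print("not ready yet:", exc)
        time.sleep(10)
else:
    raise TimeoutError("SGLang server did not become ready in time")
```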
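Besides the curl command shown in the README, the endpoint started by compose.yaml can be exercised from Python with the OpenAI client. This is a minimal sketch, not part of the patch itself; it assumes the default port 8699 and, because entrypoint.sh launches the server with `--api-key ${HF_TOKEN}`, it sends the same token as the client API key (any placeholder works if key checking is not enforced in your deployment).

```python
# Minimal sketch of querying the SGLang OpenAI-compatible endpoint started by compose.yaml.
# Assumptions: port 8699; HF_TOKEN matches the --api-key value used in entrypoint.sh;
# MODEL_ID is the model the server was started with.
import os

import openai

client = openai.Client(
    base_url="http://localhost:8699/v1",
    api_key=os.environ.get("HF_TOKEN", "EMPTY"),
)

response = client.chat.completions.create(
    model=os.environ.get("MODEL_ID", "meta-llama/Llama-4-Maverick-17B-128E-Instruct"),
    messages=[{"role": "user", "content": "Hello! What is your name?"}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```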
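The guard-model example in the README prompts the model to answer with `safe` or `unsafe` on the first line and, when unsafe, a comma-separated list of violated categories on the second. A small helper along these lines (hypothetical, not part of the patch) can turn that free-text reply into a structured result:

```python
# Hypothetical helper (not part of this patch) for parsing Llama Guard style output:
# first line "safe"/"unsafe", optional second line with comma-separated category codes.
def parse_guard_output(text: str) -> tuple[bool, list[str]]:
    lines = [line.strip() for line in text.strip().splitlines() if line.strip()]
    if not lines:
        return True, []  # treat an empty reply as safe; adjust to your own policy
    is_safe = lines[0].lower().startswith("safe")
    categories: list[str] = []
    if not is_safe and len(lines) > 1:
        categories = [c.strip() for c in lines[1].split(",") if c.strip()]
    return is_safe, categories


# Example usage with content returned by the guard-model request in the README:
print(parse_guard_output("unsafe\nS9"))  # (False, ['S9'])
print(parse_guard_output("safe"))        # (True, [])
```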