diff --git a/comps/llms/deployment/docker_compose/compose_text-generation.yaml b/comps/llms/deployment/docker_compose/compose_text-generation.yaml
index fbf503ed62..1bb58d0c17 100644
--- a/comps/llms/deployment/docker_compose/compose_text-generation.yaml
+++ b/comps/llms/deployment/docker_compose/compose_text-generation.yaml
@@ -5,6 +5,8 @@ include:
   - ../../../third_parties/tgi/deployment/docker_compose/compose.yaml
   - ../../../third_parties/vllm/deployment/docker_compose/compose.yaml
   - ../../../third_parties/ollama/deployment/docker_compose/compose.yaml
+  - ../../../third_parties/llamacpp/deployment/docker_compose/compose.yaml
+

 services:
   textgen:
@@ -100,6 +102,16 @@ services:
     environment:
       LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative}

+  textgen-service-llamacpp:
+    extends: textgen
+    container_name: textgen-service-llamacpp
+    environment:
+      LLM_ENDPOINT: http://llamacpp-server
+      LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenService}
+    depends_on:
+      llamacpp-server:
+        condition: service_healthy
+
 networks:
   default:
     driver: bridge
diff --git a/comps/llms/src/text-generation/README_llamacpp.md b/comps/llms/src/text-generation/README_llamacpp.md
new file mode 100644
index 0000000000..237d515f2c
--- /dev/null
+++ b/comps/llms/src/text-generation/README_llamacpp.md
@@ -0,0 +1,83 @@
+# llama.cpp Introduction
+
+[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++ and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud".
+
+This OPEA component wraps the llama.cpp server so that it can interface with other OPEA components or be used to build OPEA Megaservices.
+
+llama.cpp supports a wide range of [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends); this component has so far been tested only on CPU.
+
+To use a CUDA server, please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly.
+
+## Get Started
+
+### 1. Download a gguf model to serve
+
+To download an example .gguf model into a local model directory:
+
+```bash
+export MODEL_PATH=~/models
+mkdir -p $MODEL_PATH # -p creates the directory only if it doesn't already exist
+cd $MODEL_PATH
+wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf
+```
+
+### 2. Set Environment Variables
+
+```bash
+export MODEL_PATH=~/models
+export host_ip=$(hostname -I | awk '{print $1}')
+export TEXTGEN_PORT=9000
+export LLM_ENDPOINT_PORT=8008
+export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
+export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf"
+export LLAMA_ARG_CTX_SIZE=4096
+```
+
+### 3. Run the llama.cpp OPEA Microservice
+
+```bash
+export service_name="textgen-service-llamacpp"
+cd comps/llms/deployment/docker_compose/
+docker compose -f compose_text-generation.yaml up ${service_name} -d
+```
+
+The server output can be observed in a terminal with `docker logs llamacpp-server`.
+
+## Consume the Service
+
+Verify the backend llama.cpp server:
+
+```bash
+curl http://0.0.0.0:8008/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer no-key" \
+  -d '{
+    "messages": [
+      {
+        "role": "system",
+        "content": "You are a helpful assistant."
+      },
+      {
+        "role": "user",
+        "content": "What is deep learning?"
+      }
+    ]
+  }'
+```
+
+Consume the service:
+
+This component follows the OpenAI API convention:
+
+```bash
+curl -X POST http://localhost:9000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [{"role": "user", "content": "Write a limerick about python exceptions"}],
+    "max_tokens": 100,
+    "temperature": 0.7,
+    "top_p": 0.9,
+    "top_k": 50,
+    "stream": false
+  }'
+```
diff --git a/comps/third_parties/llamacpp/README.md b/comps/third_parties/llamacpp/README.md
new file mode 100644
index 0000000000..3f051ca32d
--- /dev/null
+++ b/comps/third_parties/llamacpp/README.md
@@ -0,0 +1,55 @@
+# Introduction
+
+[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++ and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud".
+
+This OPEA component wraps the llama.cpp server so that it can interface with other OPEA components or be used to build OPEA Megaservices.
+
+llama.cpp supports a wide range of [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends); this component has so far been tested only on CPU.
+
+To use a CUDA server, please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly.
+
+## Get Started
+
+### 1. Download a gguf Model
+
+To download an example .gguf model into a local model directory:
+
+```bash
+export MODEL_PATH=~/models
+mkdir -p $MODEL_PATH # -p creates the directory only if it doesn't already exist
+cd $MODEL_PATH
+
+wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf
+```
+
+### 2. Set Environment Variables
+
+```bash
+export MODEL_PATH=~/models
+export host_ip=$(hostname -I | awk '{print $1}')
+export LLM_ENDPOINT_PORT=8008
+export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf"
+export LLAMA_ARG_CTX_SIZE=4096
+```
+
+### 3. Run the llama.cpp Backend Microservice
+
+```bash
+cd deployment/docker_compose
+docker compose -f compose.yaml up llamacpp-server -d
+```
+
+To use this in an OPEA text generation component, please see [llama.cpp text-generation](../../llms/src/text-generation/README_llamacpp.md).
+
+Note: use `docker logs llamacpp-server` to observe the server output.
+
+## Consume the Service
+
+llama.cpp exposes an OpenAI-style API:
+
+```bash
+curl http://${host_ip}:8008/v1/chat/completions \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
+```
diff --git a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml
new file mode 100644
index 0000000000..c352db8e39
--- /dev/null
+++ b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml
@@ -0,0 +1,37 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+  llamacpp-server:
+    image: ghcr.io/ggerganov/llama.cpp:server-b4419
+    container_name: llamacpp-server
+    ports:
+      - ${LLM_ENDPOINT_PORT:-8008}:80
+    volumes:
+      # Download the .gguf models to this path.
+      - ${MODEL_PATH:-~/models}:/models
+    environment:
+      LOGFLAG: False
+      no_proxy: ${no_proxy}
+      https_proxy: ${https_proxy}
+      http_proxy: ${http_proxy}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
+      host_ip: ${host_ip}
+      # llama.cpp env variables. 
Please refer to reference: + # https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md + LLAMA_ARG_PORT: 80 + LLAMA_ARG_MODEL: /$LLM_MODEL_ID + LLAMA_ARG_CTX_SIZE: ${LLAMA_ARG_CTX_SIZE:-4096} + LLAMA_ARG_N_PARALLEL: 2 + LLAMA_ARG_ENDPOINT_METRICS: 1 + ipc: host + healthcheck: + test: [ "CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1" ] + interval: 10s + timeout: 10s + retries: 100 + +networks: + default: + driver: bridge diff --git a/tests/llms/test_llms_doc-summarization_tgi.sh b/tests/llms/test_llms_doc-summarization_tgi.sh index 18f9b0da86..28dea550e7 100644 --- a/tests/llms/test_llms_doc-summarization_tgi.sh +++ b/tests/llms/test_llms_doc-summarization_tgi.sh @@ -140,10 +140,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh index bc6cd03b0f..ff75cd0528 100644 --- a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh @@ -141,10 +141,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_doc-summarization_vllm.sh b/tests/llms/test_llms_doc-summarization_vllm.sh index 88cbef9c92..78ef2dca69 100644 --- a/tests/llms/test_llms_doc-summarization_vllm.sh +++ b/tests/llms/test_llms_doc-summarization_vllm.sh @@ -155,10 +155,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh index f2abdb607b..9cc1775251 100644 --- a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh @@ -158,10 +158,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_faq-generation_tgi.sh b/tests/llms/test_llms_faq-generation_tgi.sh index d0ae7aa95c..b95389a273 100644 --- a/tests/llms/test_llms_faq-generation_tgi.sh +++ b/tests/llms/test_llms_faq-generation_tgi.sh @@ -102,10 +102,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_faq-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers 
before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh b/tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh index 752c340c55..160c6f402f 100644 --- a/tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh @@ -111,10 +111,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_faq-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_faq-generation_vllm.sh b/tests/llms/test_llms_faq-generation_vllm.sh index 588ed4981a..43b7b1c65f 100644 --- a/tests/llms/test_llms_faq-generation_vllm.sh +++ b/tests/llms/test_llms_faq-generation_vllm.sh @@ -118,10 +118,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_faq-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh b/tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh index b19655444e..12ad1f3091 100644 --- a/tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh @@ -121,8 +121,11 @@ function stop_docker() { } function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a build_docker_images start_service diff --git a/tests/llms/test_llms_text-generation_native_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_native_on_intel_hpu.sh index 0d39a86905..d348d26738 100644 --- a/tests/llms/test_llms_text-generation_native_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_native_on_intel_hpu.sh @@ -87,10 +87,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service validate_microservice diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh new file mode 100644 index 0000000000..1f2f4fcf16 --- /dev/null +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# Copyright (C) 2024 Prediction Guard, Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +set -x + +IMAGE_REPO=${IMAGE_REPO:-"opea"} +export REGISTRY=${IMAGE_REPO} +export TAG="comps" +echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" +echo "TAG=${TAG}" + +WORKPATH=$(dirname "$PWD") # Assumes the script is called from GenAIComps/comps +host_ip=$(hostname -I | awk '{print $1}') # Adjust to a more reliable command +if [ -z "$host_ip" ]; then + host_ip="localhost" # Default to localhost if IP address is empty +fi +LOG_PATH="$WORKPATH/tests" +service_name="textgen-service-llamacpp" + + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t ${REGISTRY:-opea}/llm-textgen:${TAG:-latest} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llm-textgen built fail" + exit 1 + else + echo "opea/llm-textgen built successful" + fi +} + +function start_service() { + export host_ip=${host_ip} # must be an environment variable declared in scope of start_service + export LLM_ENDPOINT_PORT=8008 + export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" + export TEXTGEN_PORT=9000 + export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf" + export LLAMA_ARG_CTX_SIZE=4096 + export LOGFLAG=True + + export MODEL_PATH=~/models + mkdir -p $MODEL_PATH + cd $MODEL_PATH + wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf \ + -q --show-progress --progress=bar + + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log + docker ps -a + docker logs llamacpp-server + sleep 30s # Allow the service to start +} + +function validate_microservice() { + result=$(http_proxy="" curl -X POST http://${host_ip}:${TEXTGEN_PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [{"role": "user", "content": "What is AI?"}], + "max_tokens": 100, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50, + "stream": false + }') + + if [[ $result == *"content"* ]]; then + echo "Service response is correct." + else + echo "Result wrong. Received was $result" + docker logs ${service_name} + exit 1 + fi +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + # Using down without particular service_name since still can have orphan containers that aren't taken down from other tests. 
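+    # Because compose_text-generation.yaml includes the llamacpp compose file, a full `down` should also remove the llamacpp-server backend container started by this test.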
+ docker compose -f compose_text-generation.yaml down --remove-orphans +} + +function main() { + + echo "Docker containers before stop_docker" + docker ps -a + stop_docker + echo "Docker containers after stop_docker" + docker ps -a + + stop_docker + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune +} + +main +set +x diff --git a/tests/llms/test_llms_text-generation_service_ollama.sh b/tests/llms/test_llms_text-generation_service_ollama.sh index d5087ce7ec..dbf638e130 100644 --- a/tests/llms/test_llms_text-generation_service_ollama.sh +++ b/tests/llms/test_llms_text-generation_service_ollama.sh @@ -69,7 +69,12 @@ function stop_docker() { function main() { + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images llm_models=( diff --git a/tests/llms/test_llms_text-generation_service_tgi.sh b/tests/llms/test_llms_text-generation_service_tgi.sh index c604470257..0e691c65fd 100644 --- a/tests/llms/test_llms_text-generation_service_tgi.sh +++ b/tests/llms/test_llms_text-generation_service_tgi.sh @@ -118,7 +118,11 @@ function stop_docker() { function main() { + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a build_docker_images pip install --no-cache-dir openai pydantic diff --git a/tests/llms/test_llms_text-generation_service_tgi_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_service_tgi_on_intel_hpu.sh index c91a51498b..efa3809b89 100644 --- a/tests/llms/test_llms_text-generation_service_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_service_tgi_on_intel_hpu.sh @@ -119,7 +119,11 @@ function stop_docker() { function main() { + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a build_docker_images pip install --no-cache-dir openai pydantic diff --git a/tests/llms/test_llms_text-generation_service_vllm_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_service_vllm_on_intel_hpu.sh index ffee44c3d7..0ed3c37f53 100644 --- a/tests/llms/test_llms_text-generation_service_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_service_vllm_on_intel_hpu.sh @@ -131,10 +131,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images pip install --no-cache-dir openai pydantic
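
For a quick manual check outside the CI scripts, the sketch below assumes the compose services from this PR are already running on the local host with the default ports used throughout this diff (8008 for `llamacpp-server`, 9000 for `textgen-service-llamacpp`). The `/health` and `/v1/chat/completions` endpoints are the ones exercised by the compose healthcheck and `validate_microservice` above; the prompt text and echo messages are illustrative only.

```bash
#!/bin/bash
# Manual smoke test for the llama.cpp stack (sketch; assumes the compose
# services are already up on this host).

host_ip=$(hostname -I | awk '{print $1}')
LLM_ENDPOINT_PORT=${LLM_ENDPOINT_PORT:-8008}   # llamacpp-server (mapped to container port 80)
TEXTGEN_PORT=${TEXTGEN_PORT:-9000}             # textgen-service-llamacpp wrapper

# Backend health: the same endpoint the compose healthcheck polls.
curl -sf "http://${host_ip}:${LLM_ENDPOINT_PORT}/health" \
  && echo "llamacpp-server is healthy" \
  || echo "llamacpp-server health check failed"

# OpenAI-style chat completion through the OPEA textgen microservice,
# mirroring validate_microservice in the new test script.
curl -s -X POST "http://${host_ip}:${TEXTGEN_PORT}/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
        "messages": [{"role": "user", "content": "What is AI?"}],
        "max_tokens": 64,
        "stream": false
      }' | grep -q '"content"' \
  && echo "textgen-service-llamacpp responded with content" \
  || echo "textgen-service-llamacpp response missing content"
```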