51 commits
397f7b8
First commit of llamacpp Opea component
edlee123 Dec 20, 2024
cb4f5e5
Removed unneeded requirements file
edlee123 Dec 20, 2024
df3d943
Merge branch 'main' into llamacpp
edlee123 Dec 20, 2024
8893f38
Merge branch 'main' into llamacpp
edlee123 Dec 28, 2024
2a48bae
Pin the llama.cpp server version, and fix small typo
edlee123 Jan 6, 2025
644ecce
Merge branch 'llamacpp' of github.com:edlee123/GenAIComps into llamacpp
edlee123 Jan 6, 2025
4e82152
Update README.md to describe hardware support, and provide reference.
edlee123 Jan 6, 2025
baf381d
Updated docker_compose_llm.yaml so that the llamacpp-server so the pu…
edlee123 Jan 6, 2025
7bab970
Merge branch 'main' into llamacpp
edlee123 Jan 6, 2025
e4f4b70
Merge branch 'main' into llamacpp
edlee123 Jan 7, 2025
9d7539d
Small adjustments to README.md
edlee123 Jan 7, 2025
2cf25e5
Merge branch 'main' into llamacpp
edlee123 Jan 8, 2025
fd15ee7
This removes unneeded dependencies in the Dockerfile, unneeded entryp…
edlee123 Jan 10, 2025
666196c
Merge branch 'llamacpp' of github.com:edlee123/GenAIComps into llamacpp
edlee123 Jan 10, 2025
104527a
Merge branch 'main' into llamacpp
edlee123 Jan 10, 2025
c931902
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 10, 2025
6b98403
Merge branch 'main' into llamacpp
edlee123 Jan 24, 2025
240d3d1
Merge branch 'main' into llamacpp
edlee123 Feb 3, 2025
91e0fd4
Merge branch 'main' into llamacpp
edlee123 Feb 14, 2025
a75d28d
Refactored llama cpp and text-generation README_llamacpp.md
edlee123 Feb 14, 2025
830da58
Delete unrefactored files
edlee123 Feb 14, 2025
8d058bb
Adding llama.cpp backend include in the compose_text-genearation.yaml
edlee123 Feb 14, 2025
a0294a5
Merge branch 'llamacpp' of github.com:edlee123/GenAIComps into llamacpp
edlee123 Feb 14, 2025
a6740b6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 14, 2025
d0e27bf
Fix service name
edlee123 Feb 21, 2025
91324af
Revise llamacpp, using smaller Qwen model and remove unnecessary curl…
edlee123 Feb 21, 2025
f295e29
Update llamacpp thirdparty readme to use smaller model
edlee123 Feb 21, 2025
480cb69
Fix healthcheck in llamacpp deployment compose.yaml
edlee123 Feb 21, 2025
2c9f877
Wrote a test and tested for llamacpp text gen service
edlee123 Feb 21, 2025
f3147f1
Merge branch 'llamacpp' of github.com:edlee123/GenAIComps into llamacpp
edlee123 Feb 21, 2025
7310d6a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 21, 2025
80ed9b0
Merge branch 'main' into llamacpp
edlee123 Feb 21, 2025
efde309
Increase the llamacpp-server wait time
edlee123 Feb 21, 2025
1a7db52
Merge branch 'llamacpp' of github.com:edlee123/GenAIComps into llamacpp
edlee123 Feb 21, 2025
c474a64
Fixed typos on http environment variables, and volumes
edlee123 Feb 21, 2025
712f575
Splitting the llama.cpp test to use compose up on the llama.cpp third…
edlee123 Feb 21, 2025
68cc00f
add alternate command to stop and remove docker containers from previ…
edlee123 Feb 22, 2025
2dd2064
Modifying tear down of stop_docker in llamacpp tests to try to remove…
edlee123 Feb 22, 2025
dbff6fc
Adding some logs output to debug llamacpp test
edlee123 Feb 22, 2025
f184897
Found model path bug and fixed it to run llama.cpp test
edlee123 Feb 22, 2025
ea4ea38
Adjusted LLM_ENDPOINT env variable
edlee123 Feb 22, 2025
01fca03
Cleaned up test file
edlee123 Feb 22, 2025
dfd5057
Adjust host_ip env variable in scope of start_service
edlee123 Feb 22, 2025
a741320
Merge branch 'main' into llamacpp
edlee123 Feb 24, 2025
4a965da
Docker ps to debug orphaned containers.
edlee123 Feb 24, 2025
25240da
Merge branch 'llamacpp' of github.com:edlee123/GenAIComps into llamacpp
edlee123 Feb 24, 2025
32b06e9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 24, 2025
3363504
Adding output to debug orphaned docker containers
edlee123 Feb 24, 2025
421b1ab
Merge branch 'llamacpp' of github.com:edlee123/GenAIComps into llamacpp
edlee123 Feb 24, 2025
d5d3c1e
Merge branch 'main' into llamacpp
edlee123 Mar 11, 2025
d85c60e
Merge branch 'main' into llamacpp
xiguiw Mar 19, 2025
comps/llms/deployment/docker_compose/compose_text-generation.yaml
@@ -5,6 +5,8 @@ include:
  - ../../../third_parties/tgi/deployment/docker_compose/compose.yaml
  - ../../../third_parties/vllm/deployment/docker_compose/compose.yaml
  - ../../../third_parties/ollama/deployment/docker_compose/compose.yaml
  - ../../../third_parties/llamacpp/deployment/docker_compose/compose.yaml


services:
  textgen:
@@ -100,6 +102,16 @@ services:
    environment:
      LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative}

  textgen-service-llamacpp:
    extends: textgen
    container_name: textgen-service-llamacpp
    environment:
      LLM_ENDPOINT: http://llamacpp-server
      LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenService}
    depends_on:
      llamacpp-server:
        condition: service_healthy

networks:
  default:
    driver: bridge
83 changes: 83 additions & 0 deletions comps/llms/src/text-generation/README_llamacpp.md
@@ -0,0 +1,83 @@
# llama.cpp Introduction

[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++, and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud".

This OPEA component wraps the llama.cpp server so that it can interface with other OPEA components or be used to build OPEA Megaservices.

llama.cpp supports a wide range of [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends); this OPEA component has so far only been tested on CPU.

To use a CUDA server, please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify the llama.cpp deployment compose file (comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml) accordingly.

## Get Started

### 1. Download a gguf model to serve

To download an example .gguf model to a model path:

```bash
export MODEL_PATH=~/models
mkdir -p $MODEL_PATH  # -p creates the directory only if it does not already exist
cd $MODEL_PATH
wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf
```
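
Optionally, confirm the download completed (a quick sanity check; the file for this quantization is on the order of 1 GB):

```bash
ls -lh $MODEL_PATH/qwen2.5-1.5b-instruct-q4_k_m.gguf
```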

### 2. Set Environment Variables

```bash
export MODEL_PATH=~/models
export host_ip=$(hostname -I | awk '{print $1}')
export TEXTGEN_PORT=9000
export LLM_ENDPOINT_PORT=8008
export LLM_ENDPOINT="http://${host_ip}:80"
export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf"
export LLAMA_ARG_CTX_SIZE=4096
```

### 3. Run the llama.cpp OPEA Microservice

```bash
export service_name="textgen-service-llamacpp"
cd comps/llms/deployment/docker_compose/
docker compose -f compose_text-generation.yaml up ${service_name} -d
```

The server output can be observed in a terminal with `docker logs <container>`.
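
Before sending requests, it can help to confirm that the backend is healthy. A minimal sketch, assuming the default `LLM_ENDPOINT_PORT=8008` mapping from the deployment compose file (whose healthcheck polls the llama.cpp `/health` endpoint):

```bash
# Check that both the backend and the textgen wrapper are running
docker ps --filter "name=llamacpp-server" --filter "name=textgen-service-llamacpp"

# The llama.cpp server reports readiness on /health
curl http://${host_ip}:8008/health
```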

## Consume the Service

Verify the llama.cpp backend server:

```bash
curl http://0.0.0.0:8008/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer no-key" \
  -d '{
    "messages": [
      {
        "role": "system",
        "content": "You are a helpful assistant."
      },
      {
        "role": "user",
        "content": "What is deep learning?"
      }
    ]
  }'
```

Then consume the OPEA microservice itself, which follows the OpenAI API convention:

```bash
curl -X POST http://localhost:9000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [{"role": "user", "content": "Write a limerick about python exceptions"}],
    "max_tokens": 100,
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 50,
    "stream": false
  }'
```
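
Since the component follows the OpenAI convention, a streaming request can be made by setting `"stream": true`. A hedged sketch (streaming output depends on the deployed service; `-N` disables curl buffering so tokens appear as they arrive):

```bash
curl -N -X POST http://localhost:9000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [{"role": "user", "content": "Write a limerick about python exceptions"}],
    "max_tokens": 100,
    "stream": true
  }'
```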
55 changes: 55 additions & 0 deletions comps/third_parties/llamacpp/README.md
@@ -0,0 +1,55 @@
# Introduction

[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++, and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud".

This OPEA component wraps the llama.cpp server so that it can interface with other OPEA components or be used to build OPEA Megaservices.

llama.cpp supports a wide range of [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends); this OPEA component has so far only been tested on CPU.

To use a CUDA server, please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify the deployment `compose.yaml` accordingly.
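
As an illustration only, here is a minimal sketch of running the upstream CUDA server image directly with Docker. The `server-cuda` image tag, the `--gpus` flag, and the `--n-gpu-layers` value are taken from the upstream documentation and are assumptions that may need adjusting for your setup:

```bash
docker run --rm --gpus all \
  -v ${MODEL_PATH:-~/models}:/models \
  -p 8008:80 \
  ghcr.io/ggerganov/llama.cpp:server-cuda \
  -m /models/qwen2.5-1.5b-instruct-q4_k_m.gguf \
  --host 0.0.0.0 --port 80 \
  --n-gpu-layers 99  # offload all layers to the GPU
```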

## Get Started

### 1. Download a gguf Model

To download an example .gguf model to a model path:

```bash
export MODEL_PATH=~/models
mkdir -p $MODEL_PATH  # -p creates the directory only if it does not already exist
cd $MODEL_PATH

wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf
```

### 2. Set Environment Variables

```bash
export MODEL_PATH=~/models
export host_ip=$(hostname -I | awk '{print $1}')
export LLM_ENDPOINT_PORT=8008
export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf"
export LLAMA_ARG_CTX_SIZE=4096
```

### 3. Run the llama.cpp Backend Microservice

```bash
cd deployment/docker_compose
docker compose -f compose.yaml up llamacpp-server -d
```

To use this backend with an OPEA text generation component, please see [llama.cpp text-generation](../../llms/src/text-generation/README_llamacpp.md).

Note: you can use `docker logs <container>` to observe the server output.

## Consume the Service

The llama.cpp server exposes an OpenAI-compatible API:

```bash
curl http://${host_ip}:8008/v1/chat/completions \
  -X POST \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
```
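
Two additional endpoints can be useful when debugging (a sketch; `/health` is the endpoint polled by the compose healthcheck, and `/metrics` is assumed to be available because the deployment sets `LLAMA_ARG_ENDPOINT_METRICS: 1`):

```bash
# Readiness probe (same endpoint the compose healthcheck uses)
curl http://${host_ip}:8008/health

# Prometheus-style metrics
curl http://${host_ip}:8008/metrics
```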
comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml
@@ -0,0 +1,37 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  llamacpp-server:
    image: ghcr.io/ggerganov/llama.cpp:server-b4419
    container_name: llamacpp-server
    ports:
      - ${LLM_ENDPOINT_PORT:-8008}:80
    volumes:
      # Download the .gguf models to this path.
      - ${MODEL_PATH:-~/models}:/models
    environment:
      LOGFLAG: False
      no_proxy: ${no_proxy}
      https_proxy: ${https_proxy}
      http_proxy: ${http_proxy}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
      host_ip: ${host_ip}
      # llama.cpp environment variables. Please refer to:
      # https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md
      LLAMA_ARG_PORT: 80
      LLAMA_ARG_MODEL: /$LLM_MODEL_ID
      LLAMA_ARG_CTX_SIZE: ${LLAMA_ARG_CTX_SIZE:-4096}
      LLAMA_ARG_N_PARALLEL: 2
      LLAMA_ARG_ENDPOINT_METRICS: 1
    ipc: host
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100

networks:
  default:
    driver: bridge
8 changes: 5 additions & 3 deletions tests/llms/test_llms_doc-summarization_tgi.sh
@@ -140,10 +140,12 @@ function stop_docker() {
cd $WORKPATH/comps/llms/deployment/docker_compose
docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans
}

function main() {

echo "Docker containers before stop_docker"
docker ps -a
stop_docker
echo "Docker containers after stop_docker"
docker ps -a


build_docker_images
start_service
8 changes: 5 additions & 3 deletions tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh
@@ -141,10 +141,12 @@ function stop_docker() {
cd $WORKPATH/comps/llms/deployment/docker_compose
docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans
}

function main() {

echo "Docker containers before stop_docker"
docker ps -a
stop_docker
echo "Docker containers after stop_docker"
docker ps -a


build_docker_images
start_service
8 changes: 5 additions & 3 deletions tests/llms/test_llms_doc-summarization_vllm.sh
@@ -155,10 +155,12 @@ function stop_docker() {
cd $WORKPATH/comps/llms/deployment/docker_compose
docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans
}

function main() {

echo "Docker containers before stop_docker"
docker ps -a
stop_docker
echo "Docker containers after stop_docker"
docker ps -a


build_docker_images
start_service
8 changes: 5 additions & 3 deletions tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh
@@ -158,10 +158,12 @@ function stop_docker() {
cd $WORKPATH/comps/llms/deployment/docker_compose
docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans
}

function main() {

echo "Docker containers before stop_docker"
docker ps -a
stop_docker
echo "Docker containers after stop_docker"
docker ps -a


build_docker_images
start_service
8 changes: 5 additions & 3 deletions tests/llms/test_llms_faq-generation_tgi.sh
@@ -102,10 +102,12 @@ function stop_docker() {
cd $WORKPATH/comps/llms/deployment/docker_compose
docker compose -f compose_faq-generation.yaml down ${service_name} --remove-orphans
}

function main() {

echo "Docker containers before stop_docker"
docker ps -a
stop_docker
echo "Docker containers after stop_docker"
docker ps -a


build_docker_images
start_service
8 changes: 5 additions & 3 deletions tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh
@@ -111,10 +111,12 @@ function stop_docker() {
cd $WORKPATH/comps/llms/deployment/docker_compose
docker compose -f compose_faq-generation.yaml down ${service_name} --remove-orphans
}

function main() {

echo "Docker containers before stop_docker"
docker ps -a
stop_docker
echo "Docker containers after stop_docker"
docker ps -a


build_docker_images
start_service
8 changes: 5 additions & 3 deletions tests/llms/test_llms_faq-generation_vllm.sh
@@ -118,10 +118,12 @@ function stop_docker() {
cd $WORKPATH/comps/llms/deployment/docker_compose
docker compose -f compose_faq-generation.yaml down ${service_name} --remove-orphans
}

function main() {

echo "Docker containers before stop_docker"
docker ps -a
stop_docker
echo "Docker containers after stop_docker"
docker ps -a


build_docker_images
start_service
5 changes: 4 additions & 1 deletion tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh
@@ -121,8 +121,11 @@
}

function main() {

echo "Docker containers before stop_docker"
docker ps -a
stop_docker
echo "Docker containers after stop_docker"
docker ps -a

build_docker_images
start_service
8 changes: 5 additions & 3 deletions tests/llms/test_llms_text-generation_native_on_intel_hpu.sh
@@ -87,10 +87,12 @@ function stop_docker() {
cd $WORKPATH/comps/llms/deployment/docker_compose
docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans
}

function main() {

echo "Docker containers before stop_docker"
docker ps -a
stop_docker
echo "Docker containers after stop_docker"
docker ps -a

build_docker_images
start_service
validate_microservice