13 changes: 12 additions & 1 deletion README.md
@@ -154,7 +154,18 @@ This ensures the system can handle high concurrency by allowing more open files
**Run the benchmark script**:

```bash
python evals/benchmark/benchmark.py
cd evals/benchmark/
python benchmark.py
```

> NOTE: By default, benchmark.py uses the benchmark.yaml file as its test case data.
> Pass a custom YAML file with the --yaml argument to use different test case data.

For example, to use the test data for the examples started with Docker Compose on HPU:
```bash
cd evals/benchmark/
python benchmark.py --yaml docker.hpu.benchmark.yaml
```

Results will be saved in the directory specified by `test_output_dir` in the configuration.
56 changes: 56 additions & 0 deletions docker/README.md
@@ -0,0 +1,56 @@
# GenAIEval Dockerfiles

Dockerfiles are provided along with related GenAIEval scripts.

## Gaudi Requirements
Please follow the [Driver Installation](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html) guide to install the Gaudi driver on the system.
To use the Dockerfiles provided here, follow the [Docker Installation](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html) guide to set up the Habana runtime for Docker images.
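
As a quick sanity check, assuming the driver and the Habana container runtime have been installed per the guides above (the commands shown are illustrative):

```bash
# List the Gaudi devices; the accelerators should appear if the driver is loaded
hl-smi

# Confirm the habana runtime is registered with Docker (assumes it was added to daemon.json)
grep -i habana /etc/docker/daemon.json
```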

## Run GenAIEval on Gaudi
### Docker Build
To build the image from the Dockerfile for Gaudi, run the command below to build the opea/genai-eval image.
```bash
docker build --no-cache -t opea/genai-eval:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy -f hpu.dockerfile .
```
### Docker Run
After the Docker build completes, run the command below to start a container; you will land inside the container in its working directory (/GenAIEval).
```bash
docker run -it --name opea-eval --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=ALL --privileged=true -v /var/run/docker.sock://var/run/docker.sock --net=host --ipc=host opea/genai-eval:latest
```

## Benchmarking OPEA Examples on Intel® Gaudi® AI Processor and Xeon® Processor
The benchmark script uses a different YAML file depending on whether the benchmark runs on Gaudi or Xeon.
### Docker Build
To build the image from the Dockerfile for benchmarking OPEA examples, run the command below to build the opea/genai-eval-benchmark image.
```bash
docker build --no-cache -t opea/genai-eval-benchmark:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy -f benchmark.dockerfile .
```
### Run an OPEA Example using Docker Compose
Follow the OPEA example's Docker Compose README to bring up the example you want to benchmark.
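
For instance, to bring up ChatQnA on Gaudi with Docker Compose (the path below is illustrative and assumes a local checkout of GenAIExamples; see that example's README for the exact location and required environment variables):

```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi
docker compose up -d
```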

### Docker Run
After the Docker build completes, run the command below to start a container attached to the OPEA example's default network.
#### Xeon
```bash
docker run -it --name opea-eval -v /var/run/docker.sock://var/run/docker.sock --net=xeon_default --ipc=host opea/genai-eval-benchmark:latest
```
#### Gaudi
```bash
docker run -it --name opea-eval -v /var/run/docker.sock://var/run/docker.sock --net=gaudi_default --ipc=host opea/genai-eval-benchmark:latest
```
> [!NOTE]
> Hugging Face model files can be large, so we recommend using an external disk for the Hugging Face hub folder. \
> Set the HF_HOME environment variable to a path on the external disk and mount that path into the container, \
> e.g. "-e HF_HOME=/mnt/huggingface -v /mnt:/mnt". \
> To use Hugging Face models, HF_TOKEN must also be set as an environment variable, \
> e.g. "-e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}".

### Run the Benchmark
#### Xeon
```bash
python3 benchmark.py --yaml docker.cpu.benchmark.yaml
```
#### Gaudi
```bash
python3 benchmark.py --yaml docker.hpu.benchmark.yaml
```
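
Results are written to the directory specified by `test_output_dir` in the YAML configuration (`./benchmark_output` in the provided configurations), for example:

```bash
ls ./benchmark_output
```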
27 changes: 27 additions & 0 deletions docker/benchmark.dockerfile
@@ -0,0 +1,27 @@
#FROM python:3.11-slim AS base
#ARG BASE_TAG=latest
#FROM opea/comps-base:$BASE_TAG
FROM ubuntu:24.04

ENV LANG=en_US.UTF-8
ARG REPO=https://github.com/intel-ai-tce/GenAIEval.git
ARG REPO_PATH=""
ARG BRANCH=fix_req

RUN DEBIAN_FRONTEND=noninteractive \
apt-get update && \
apt-get -y install git python3-pip python3-setuptools

# Download code
SHELL ["/bin/bash", "--login", "-c"]
RUN mkdir -p /GenAIEval
COPY ${REPO_PATH} /GenAIEval
RUN if [ "$REPO_PATH" == "" ]; then rm -rf /GenAIEval/* && rm -rf /GenAIEval/.* ; git clone --single-branch --branch=${BRANCH} ${REPO} /GenAIEval ; fi

# Build From Source
RUN cd /GenAIEval && \
pip3 install -r requirements.txt --break-system-packages && \
python3 setup.py install && \
pip3 list

WORKDIR /GenAIEval/evals/benchmark
3 changes: 1 addition & 2 deletions docker/hpu.dockerfile
@@ -1,4 +1,4 @@
FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest as hpu
FROM vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest as hpu

ENV LANG=en_US.UTF-8
ENV PYTHONPATH=/root:/usr/lib/habanalabs/
@@ -21,7 +21,6 @@ RUN pip install --upgrade pip setuptools==69.5.1
RUN cd /GenAIEval && \
pip install -r requirements.txt && \
python setup.py install && \
pip install --upgrade-strategy eager optimum[habana] && \
pip list

WORKDIR /GenAIEval/
7 changes: 7 additions & 0 deletions evals/benchmark/README.md
@@ -48,6 +48,13 @@ This command increases the maximum number of file descriptors (which represent o
python benchmark.py
```

> NOTE: Add the --yaml argument to use a customized benchmark configuration.

For example:
```bash
python benchmark.py --yaml docker.hpu.benchmark.yaml
```

The results will be stored in the directory specified by `test_output_dir` in the configuration.


8 changes: 5 additions & 3 deletions evals/benchmark/benchmark.py
@@ -303,9 +303,10 @@ def check_test_suite_config(test_suite_config):
raise ValueError("Must specify either run_time or user_queries.")


def run_benchmark(report=False):
def run_benchmark(report=False, yaml=yaml):
# Load test suite configuration
yaml_content = load_yaml("./benchmark.yaml")
print(yaml)
yaml_content = load_yaml(yaml)
# Extract data
parsed_data = extract_test_case_data(yaml_content)
test_suite_config = {
@@ -372,6 +373,7 @@ def run_benchmark(report=False):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Read and parse JSON/YAML files and output JSON file")
parser.add_argument("--report", help="Return the perf", action="store_true")
parser.add_argument("--yaml", help="Input benchmark yaml file", action="store", default="./benchmark.yaml")
args = parser.parse_args()

run_benchmark(report=args.report)
run_benchmark(report=args.report, yaml=args.yaml)
186 changes: 186 additions & 0 deletions evals/benchmark/docker.cpu.benchmark.yaml
@@ -0,0 +1,186 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

test_suite_config: # Overall configuration settings for the test suite
examples: ["chatqna"] # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna
deployment_type: "docker" # Default is "k8s", can also be "docker"
service_ip: "chatqna-xeon-nginx-server" # Leave as None for k8s, specify for Docker
service_port: 80 # Leave as None for k8s, specify for Docker
warm_ups: 0 # Number of test requests for warm-up
run_time: 60m # The max total run time for the test suite
seed: # The seed for all RNGs
user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests at each concurrency level
query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by default.
random_prompt: false # Use random prompts if true, fixed prompts if false
collect_service_metric: true # Collect service metrics if true, do not collect service metrics if false
data_visualization: true # Generate data visualization if true, do not generate data visualization if false
llm_model: "meta-llama/Meta-Llama-3-8B-Instruct" # The LLM model used for the test
test_output_dir: "./benchmark_output" # The directory to store the test output
load_shape: # Tenant concurrency pattern
name: constant # poisson or constant(locust default load shape)
params: # Loadshape-specific parameters
constant: # Constant load shape specific parameters, activate only if load_shape.name is constant
concurrent_level: 4 # If user_queries is specified, concurrent_level is target number of requests per user. If not, it is the number of simulated users
# arrival_rate: 1.0 # Request arrival rate. If set, concurrent_level will be overridden, constant load will be generated based on arrival-rate
poisson: # Poisson load shape specific parameters, activate only if load_shape.name is poisson
arrival_rate: 1.0 # Request arrival rate
namespace: "" # Fill the user-defined namespace. Otherwise, it will be default.

test_cases:
chatqna:
embedding:
run_test: false
service_name: "embedding-svc" # Replace with your service name
embedserve:
run_test: false
service_name: "embedding-dependency-svc" # Replace with your service name
retriever:
run_test: false
service_name: "retriever-svc" # Replace with your service name
parameters:
search_type: "similarity"
k: 1
fetch_k: 20
lambda_mult: 0.5
score_threshold: 0.2
reranking:
run_test: false
service_name: "reranking-svc" # Replace with your service name
parameters:
top_n: 1
rerankserve:
run_test: false
service_name: "reranking-dependency-svc" # Replace with your service name
llm:
run_test: false
service_name: "llm-svc" # Replace with your service name
parameters:
max_new_tokens: 128
temperature: 0.01
top_k: 10
top_p: 0.95
repetition_penalty: 1.03
streaming: true
llmserve:
run_test: false
service_name: "llm-dependency-svc" # Replace with your service name
e2e:
run_test: true
#service_name: "chatqna-xeon-backend-server" # Replace with your service name
service_name: "chatqna-xeon-nginx-server" # Replace with your service name
service_list: # Replace with your k8s service names if deploy with k8s
# or container names if deploy with Docker for metrics collection,
# activate if collect_service_metric is true
- "chatqna-xeon-backend-server"
- "chatqna-xeon-nginx-server"
- "dataprep-redis-server"
- "tei-embedding-server"
- "vllm-service"
- "tei-reranking-server"
- "retriever-redis-server"
- "redis-vector-db"
dataset: # Activate if random_prompt=true: leave blank = default dataset(WebQuestions) or sharegpt
prompts: In an increasingly complex world where technology has rapidly advanced and evolved far beyond our wildest dreams, humanity now stands on the precipice of a revolutionary new era that is filled with endless possibilities, profound and significant changes, as well as intricate challenges that we must actively address. The year is now 2050, and artificial intelligence has seamlessly woven itself deeply and intricately into the very fabric of everyday life. Autonomous vehicles glide effortlessly and smoothly through the bustling, vibrant, and lively city streets, while drones swiftly and accurately deliver packages with pinpoint precision, making logistics and delivery systems more efficient, streamlined, and advanced than ever before in the entire history of humankind and technological development. Smart homes, equipped with cutting-edge advanced sensors and sophisticated algorithms, anticipate every possible need and requirement of their inhabitants, creating an environment of unparalleled convenience, exceptional comfort, and remarkable efficiency that enhances our daily lives. However, with these remarkable and groundbreaking advancements come a host of new challenges, uncertainties, and ethical dilemmas that society must confront, navigate, and address in a thoughtful and deliberate manner. As we carefully navigate through this brave new world filled with astonishing technological marvels, innovations, and breakthroughs, questions about the implications and consequences of AI technologies become increasingly pressing, relevant, and urgent for individuals and communities alike. Issues surrounding privacy—how our personal data is collected, securely stored, processed, and utilized—emerge alongside significant concerns about security in a rapidly evolving digital landscape where vulnerabilities can be easily and readily exploited by malicious actors, hackers, and cybercriminals. Moreover, philosophical inquiries regarding the very nature of consciousness itself rise prominently to the forefront of public discourse, debate, and discussion, inviting diverse perspectives, opinions, and ethical considerations from various stakeholders. In light of these profound developments and transformative changes that we are witnessing, I would like to gain a much deeper, broader, and more comprehensive understanding of what artificial intelligence truly is and what it encompasses in its entirety and complexity. Could you elaborate extensively, thoroughly, and comprehensively on its precise definition, its wide-ranging and expansive scope, as well as the myriad and diverse ways it significantly impacts our daily lives, personal experiences, and society as a whole in various dimensions and aspects? # User-customized prompts, activate if random_prompt=false.
max_output: 128 # max number of output tokens
k: 1 # number of retrieved documents

codegen:
llm:
run_test: true
service_name: "llm-dependency-svc" # Replace with your service name
parameters:
model_name: "Qwen/CodeQwen1.5-7B-Chat"
max_new_tokens: 128
temperature: 0.01
top_k: 10
top_p: 0.95
repetition_penalty: 1.03
streaming: true
llmserve:
run_test: true
service_name: "llm-svc" # Replace with your service name
e2e:
run_test: true
service_name: "codegen-backend-svc" # Replace with your service name

codetrans:
llm:
run_test: true
service_name: "llm-svc" # Replace with your service name
parameters:
model_name: "HuggingFaceH4/mistral-7b-grok"
max_new_tokens: 128
temperature: 0.01
top_k: 10
top_p: 0.95
repetition_penalty: 1.03
streaming: true
llmserve:
run_test: true
service_name: "codetrans-llm-svc" # Replace with your service name
e2e:
run_test: true
service_name: "codetrans-backend-server-svc" # Replace with your service name

faqgen:
llm:
run_test: false
service_name: "faq-tgi-svc" # Replace with your service name
parameters:
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
max_new_tokens: 128
temperature: 0.01
top_k: 10
top_p: 0.95
repetition_penalty: 1.03
streaming: true
llmserve:
run_test: false
service_name: "faq-micro-svc" # Replace with your service name
e2e:
run_test: true
service_name: "faq-mega-server-svc" # Replace with your service name

audioqna:
asr:
run_test: true
service_name: "asr-svc" # Replace with your service name
llm:
run_test: true
service_name: "llm-svc" # Replace with your service name
parameters:
model_name: "Intel/neural-chat-7b-v3-3"
max_new_tokens: 128
temperature: 0.01
top_k: 10
top_p: 0.95
repetition_penalty: 1.03
streaming: true
llmserve:
run_test: true
service_name: "llm-svc" # Replace with your service name
tts:
run_test: true
service_name: "tts-svc" # Replace with your service name
e2e:
run_test: true
service_name: "audioqna-backend-server-svc" # Replace with your service name

visualqna:
lvm:
run_test: true
service_name: "llm-svc" # Replace with your service name
parameters:
model_name: "llava-hf/llava-v1.6-mistral-7b-hf"
max_new_tokens: 128
temperature: 0.01
top_k: 10
top_p: 0.95
repetition_penalty: 1.03
streaming: true
lvmserve:
run_test: true
service_name: "lvm-serving-svc" # Replace with your service name
e2e:
run_test: true
service_name: "visualqna-backend-server-svc" # Replace with your service name