From b38ea47894b733d93851c20de7aa39f05d7483c5 Mon Sep 17 00:00:00 2001
From: Wenxin Zhang
Date: Fri, 7 Mar 2025 11:01:06 +0800
Subject: [PATCH 1/4] add tests

Signed-off-by: Wenxin Zhang
---
 .github/code_spell_ignore.txt                 |   1 +
 .github/license_template.txt                  |   2 +
 .github/pull_request_template.md              |  24 +++
 .github/workflows/change_color                |  80 +++++++++
 .github/workflows/docker/code-scan.dockerfile |  23 +++
 .github/workflows/docker/ut.dockerfile        |  23 +++
 .github/workflows/mix-code-scan.yml           |  66 ++++++++
 .github/workflows/mix-trellix.yml             |  32 ++++
 .github/workflows/pr-link-path-scan.yaml      | 153 ++++++++++++++++++
 .github/workflows/scripts/codeScan/bandit.sh  |  22 +++
 .../workflows/scripts/codeScan/hadolint.sh    |  17 ++
 .github/workflows/scripts/codeScan/trellix.sh |  49 ++++++
 .pre-commit-config.yaml                       | 129 +++++++++++++++
 pyproject.toml                                | 109 +++++++++++++
 samples/README.md                             |   2 +-
 15 files changed, 731 insertions(+), 1 deletion(-)
 create mode 100644 .github/code_spell_ignore.txt
 create mode 100644 .github/license_template.txt
 create mode 100644 .github/pull_request_template.md
 create mode 100644 .github/workflows/change_color
 create mode 100644 .github/workflows/docker/code-scan.dockerfile
 create mode 100644 .github/workflows/docker/ut.dockerfile
 create mode 100644 .github/workflows/mix-code-scan.yml
 create mode 100644 .github/workflows/mix-trellix.yml
 create mode 100644 .github/workflows/pr-link-path-scan.yaml
 create mode 100644 .github/workflows/scripts/codeScan/bandit.sh
 create mode 100644 .github/workflows/scripts/codeScan/hadolint.sh
 create mode 100644 .github/workflows/scripts/codeScan/trellix.sh
 create mode 100644 .pre-commit-config.yaml

diff --git a/.github/code_spell_ignore.txt b/.github/code_spell_ignore.txt
new file mode 100644
index 0000000..8d1c8b6
--- /dev/null
+++ b/.github/code_spell_ignore.txt
@@ -0,0 +1 @@
+
diff --git a/.github/license_template.txt b/.github/license_template.txt
new file mode 100644
index 0000000..a041037
--- /dev/null
+++ b/.github/license_template.txt
@@ -0,0 +1,2 @@
+Copyright (C) 2024 Intel Corporation
+SPDX-License-Identifier: Apache-2.0
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000..25b5629
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,24 @@
+## Description
+
+A summary of the proposed changes, as well as the relevant motivation and context.
+
+## Issues
+
+List the issue or RFC link this PR addresses. If there is no such link, please mark it as `n/a`.
+
+## Type of change
+
+Select the type of change from the list below. Please delete options that are not relevant.
+
+- [ ] Bug fix (non-breaking change which fixes an issue)
+- [ ] New feature (non-breaking change which adds new functionality)
+- [ ] Breaking change (fix or feature that would break the existing design and interface)
+- [ ] Others (enhancement, documentation, validation, etc.)
+
+## Dependencies
+
+List any newly introduced third-party dependencies, if they exist.
+
+## Tests
+
+Describe the tests that you ran to verify your changes.
diff --git a/.github/workflows/change_color b/.github/workflows/change_color
new file mode 100644
index 0000000..b2ea724
--- /dev/null
+++ b/.github/workflows/change_color
@@ -0,0 +1,80 @@
+#!/bin/bash
+
+# -------------- general approach start----------------
+
+# 1. import this file:
+#    source path/change_color.sh
+# 2. use COLOR/BG:
+#    $VARIABLE_NAME && output_content && $RESET
+# 3. COLOR + BG:
+#    $COLOR/BG_VARIABLE_NAME && $BG/COLOR_VARIABLE_NAME && output_content && $RESET
+# 4. 
custom +# abbreviation(change number) +# txt number range (30, 37) +# bg number range (40, 47) +# special effects number range (1, 7) +# echo -en \\E[number1 + ; + number2 + ; + number3 + m" +# e.g - BG_GRAY+LIGHT_RED = "echo -en \\E[47;31m" + +# -------------- general approach end----------------== + +# general setting +# ------------- light_color start---------------- +# black +LIGHT_BLACK="echo -en \\E[30m" +# red +LIGHT_RED="echo -en \\E[31m" +# green +LIGHT_GREEN="echo -en \\E[32m" +# yellow +LIGHT_YELLOW="echo -en \\E[33m" +# blue +LIGHT_BLUE="echo -en \\E[34m" +# purple +LIGHT_PURPLE="echo -en \\E[35m" +# cyan +LIGHT_CYAN="echo -en \\E[36m" +# gray +LIGHT_GRAY="echo -en \\E[37m" +# ------------- light_color end---------------- + +# ------------- bold_color start---------------- +# black +BOLD_BLACK="echo -en \\E[1;30m" +# red +BOLD_RED="echo -en \\E[1;31m" +# green +BOLD_GREEN="echo -en \\E[1;32m" +# yellow +BOLD_YELLOW="echo -en \\E[1;33m" +# blue +BOLD_BLUE="echo -en \\E[1;34m" +# purple +BOLD_PURPLE="echo -en \\E[1;35m" +# cyan +BOLD_CYAN="echo -en \\E[1;36m" +# gray +BOLD_GRAY="echo -en \\E[1;37m" +# ------------- bold_color end---------------- + +# ------------- background_color start---------------- +# black +BG_BLACK="echo -en \\E[40m" +# red +BG_RED="echo -en \\E[41m" +# green +BG_GREEN="echo -en \\E[42m" +# yellow +BG_YELLOW="echo -en \\E[43m" +# blue +BG_BLUE="echo -en \\E[44m" +# purple +BG_PURPLE="echo -en \\E[45m" +# cyan +BG_CYAN="echo -en \\E[46m" +# gray +BG_GRAY="echo -en \\E[47m" +# ------------- background_color end---------------- + +# close +RESET="echo -en \\E[0m" diff --git a/.github/workflows/docker/code-scan.dockerfile b/.github/workflows/docker/code-scan.dockerfile new file mode 100644 index 0000000..129e146 --- /dev/null +++ b/.github/workflows/docker/code-scan.dockerfile @@ -0,0 +1,23 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +ARG UBUNTU_VER=22.04 +FROM ubuntu:${UBUNTU_VER} as devel + +ENV LANG=C.UTF-8 + +RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ + aspell \ + aspell-en \ + build-essential \ + python3 \ + python3-pip \ + python3-dev \ + python3-distutils \ + wget + +RUN ln -sf $(which python3) /usr/bin/python + +RUN python -m pip install --no-cache-dir bandit + +WORKDIR / diff --git a/.github/workflows/docker/ut.dockerfile b/.github/workflows/docker/ut.dockerfile new file mode 100644 index 0000000..e51398e --- /dev/null +++ b/.github/workflows/docker/ut.dockerfile @@ -0,0 +1,23 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +ARG UBUNTU_VER=22.04 +FROM ubuntu:${UBUNTU_VER} as devel + +ENV LANG=C.UTF-8 + +RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ + aspell \ + aspell-en \ + build-essential \ + git \ + python3 \ + python3-dev \ + python3-distutils \ + python3-pip \ + wget + +RUN ln -sf $(which python3) /usr/bin/python +RUN python -m pip install --no-cache-dir pytest pytest-cov + +WORKDIR / diff --git a/.github/workflows/mix-code-scan.yml b/.github/workflows/mix-code-scan.yml new file mode 100644 index 0000000..b4b1498 --- /dev/null +++ b/.github/workflows/mix-code-scan.yml @@ -0,0 +1,66 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +name: Code Scan + +on: + pull_request: + branches: [main] + types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped + paths-ignore: + - "**.md" + workflow_dispatch: + +# If there is a new 
commit, the previous jobs will be canceled
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  DOCKER_CONFIG_NAME: "commonDockerConfig"
+  REPO_NAME: "code-scan"
+  REPO_TAG: "1.0"
+  DOCKER_FILE_NAME: "code-scan"
+  CONTAINER_NAME: "code-scan"
+
+jobs:
+  code-scan:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        job_name: ["bandit", "hadolint"]
+      fail-fast: false
+    steps:
+      - name: Checkout Repo
+        uses: actions/checkout@v4
+
+      - name: Check Dangerous Command Injection
+        uses: opea-project/validation/actions/check-cmd@main
+        with:
+          work_dir: ${{ github.workspace }}
+
+      - name: Docker Build
+        run: |
+          docker build -f ${{ github.workspace }}/.github/workflows/docker/${{ env.DOCKER_FILE_NAME }}.dockerfile -t ${{ env.REPO_NAME }}:${{ env.REPO_TAG }} .
+
+      - name: Docker Run
+        run: |
+          if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}'$) ]]; then
+            docker stop ${{ env.CONTAINER_NAME }}
+            docker rm -vf ${{ env.CONTAINER_NAME }} || true
+          fi
+          docker run -dit --memory="4g" --memory-reservation="1g" --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} --shm-size="1g" \
+            -v ${{ github.workspace }}:/LangChinaOPEA \
+            ${{ env.REPO_NAME }}:${{ env.REPO_TAG }}
+
+      - name: Code scan check
+        run: |
+          docker exec ${{ env.CONTAINER_NAME }} \
+            bash -c "bash /LangChinaOPEA/.github/workflows/scripts/codeScan/${{ matrix.job_name }}.sh"
+
+      - name: Publish pipeline artifact
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.job_name }}
+          path: ${{ github.workspace }}/.github/workflows/scripts/codeScan/${{ matrix.job_name }}.*
diff --git a/.github/workflows/mix-trellix.yml b/.github/workflows/mix-trellix.yml
new file mode 100644
index 0000000..b0b3c73
--- /dev/null
+++ b/.github/workflows/mix-trellix.yml
@@ -0,0 +1,32 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+name: Trellix Command Line Scanner
+
+on:
+  workflow_dispatch:
+  pull_request:
+    branches: [main]
+  schedule:
+    - cron: "35 1 * * 6"
+
+jobs:
+  Trellix:
+    runs-on: trellix
+    steps:
+      - name: Clean Up Working Directory
+        run: sudo rm -rf ${{github.workspace}}/*
+
+      - name: Checkout Repo
+        uses: actions/checkout@v4
+
+      - name: Run Trellix Scanner
+        env:
+          workspace: ${{ github.workspace }}
+        run: bash .github/workflows/scripts/codeScan/trellix.sh
+
+      - name: Publish pipeline artifact
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: ${{ github.workspace }}/.github/workflows/scripts/codeScan/report.html
diff --git a/.github/workflows/pr-link-path-scan.yaml b/.github/workflows/pr-link-path-scan.yaml
new file mode 100644
index 0000000..b7071a8
--- /dev/null
+++ b/.github/workflows/pr-link-path-scan.yaml
@@ -0,0 +1,153 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+name: Check hyperlinks and relative path validity
+
+on:
+  pull_request:
+    branches: [main]
+    types: [opened, reopened, ready_for_review, synchronize]
+
+# If there is a new commit, the previous jobs will be canceled
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  check-the-validity-of-hyperlinks-in-README:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clean Up Working Directory
+        run: sudo rm -rf ${{github.workspace}}/*
+
+      - name: Checkout Repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Check the Validity of Hyperlinks
+        # 
ignore_links=("https://platform.openai.com/docs/api-reference/fine-tuning" + # "https://platform.openai.com/docs/api-reference/" + # "https://openai.com/index/whisper/" + # "https://platform.openai.com/docs/api-reference/chat/create") + run: | + cd ${{github.workspace}} + fail="FALSE" + merged_commit=$(git log -1 --format='%H') + changed_files="$(git diff --name-status --diff-filter=ARM ${{ github.event.pull_request.base.sha }} ${merged_commit} | awk '/\.md$/ {print $NF}')" + if [ -n "$changed_files" ]; then + for changed_file in $changed_files; do + echo $changed_file + url_lines=$(grep -H -Eo '\]\(http[s]?://[^)]+\)' "$changed_file" | grep -Ev 'LangChain-OPEA/blob/main') || true + if [ -n "$url_lines" ]; then + for url_line in $url_lines; do + url=$(echo "$url_line"|cut -d '(' -f2 | cut -d ')' -f1|sed 's/\.git$//') + path=$(echo "$url_line"|cut -d':' -f1 | cut -d'/' -f2-) + if [[ "$url" == "https://platform.openai.com/docs/api-reference/"* || "https://www.docker.com/get-started" == "$url" || "https://openai.com/index/whisper/" == "$url" ]]; then + echo "Link "$url" from ${{github.workspace}}/$path need to be verified by a real person." + else + response=$(curl -L -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response" -ne 200 ]; then + echo "**********Validation failed, try again**********" + response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response_retry" -eq 200 ]; then + echo "*****Retry successfully*****" + else + echo "Invalid link from ${{github.workspace}}/$path: $url" + fail="TRUE" + fi + fi + fi + done + fi + done + else + echo "No changed .md file." + fi + + if [[ "$fail" == "TRUE" ]]; then + exit 1 + else + echo "All hyperlinks are valid." + fi + shell: bash + + check-the-validity-of-relative-path: + runs-on: ubuntu-latest + steps: + - name: Clean up Working Directory + run: sudo rm -rf ${{github.workspace}}/* + + - name: Checkout Repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Checking Relative Path Validity + run: | + cd ${{github.workspace}} + fail="FALSE" + repo_name=${{ github.event.pull_request.head.repo.full_name }} + if [ "$(echo "$repo_name"|cut -d'/' -f1)" != "opea-project" ]; then + owner=$(echo "${{ github.event.pull_request.head.repo.full_name }}" |cut -d'/' -f1) + branch="https://github.com/$owner/LangChain-OPEA/tree/${{ github.event.pull_request.head.ref }}" + else + branch="https://github.com/opea-project/LangChain-OPEA/blob/${{ github.event.pull_request.head.ref }}" + fi + link_head="https://github.com/opea-project/LangChain-OPEA/blob/main" + + merged_commit=$(git log -1 --format='%H') + changed_files="$(git diff --name-status --diff-filter=ARM ${{ github.event.pull_request.base.sha }} ${merged_commit} | awk '/\.md$/ {print $NF}')" + png_lines=$(grep -Eo '\]\([^)]+\)' --include='*.md' -r .|grep -Ev 'http') + if [ -n "$png_lines" ]; then + for png_line in $png_lines; do + refer_path=$(echo "$png_line"|cut -d':' -f1 | cut -d'/' -f2-) + png_path=$(echo "$png_line"|cut -d '(' -f2 | cut -d ')' -f1) + + if [[ "${png_path:0:1}" == "/" ]]; then + check_path=$png_path + elif [[ "$png_path" == *#* ]]; then + relative_path=$(echo "$png_path" | cut -d '#' -f1) + if [ -n "$relative_path" ]; then + check_path=$(dirname "$refer_path")/$relative_path + png_path=$(echo "$png_path" | awk -F'#' '{print "#" $2}') + else + check_path=$refer_path + fi + else + check_path=$(dirname "$refer_path")/$png_path + fi + + if [ -e "$check_path" ]; then + real_path=$(realpath $check_path) + if [[ "$png_line" == *#* ]]; then + if [ -n 
"changed_files" ] && echo "$changed_files" | grep -q "^${refer_path}$"; then + url_dev=$branch$(echo "$real_path" | sed 's|.*/LangChain-OPEA||')$png_path + response=$(curl -I -L -s -o /dev/null -w "%{http_code}" "$url_dev") + if [ "$response" -ne 200 ]; then + echo "**********Validation failed, try again**********" + response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url_dev") + if [ "$response_retry" -eq 200 ]; then + echo "*****Retry successfully*****" + else + echo "Invalid path from ${{github.workspace}}/$refer_path: $png_path, link: $url_dev" + fail="TRUE" + fi + else + echo "Validation succeed $png_line" + fi + fi + fi + else + echo "$check_path does not exist" + fail="TRUE" + fi + done + fi + + if [[ "$fail" == "TRUE" ]]; then + exit 1 + else + echo "All hyperlinks are valid." + fi + shell: bash diff --git a/.github/workflows/scripts/codeScan/bandit.sh b/.github/workflows/scripts/codeScan/bandit.sh new file mode 100644 index 0000000..2b6c1aa --- /dev/null +++ b/.github/workflows/scripts/codeScan/bandit.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +source /LangChinaOPEA/.github/workflows/scripts/change_color +pip install --no-cache-dir bandit==1.7.8 +log_dir=/LangChinaOPEA/.github/workflows/scripts/codeScan +python -m bandit -r -lll -iii /LangChinaOPEA > ${log_dir}/bandit.log +exit_code=$? + +$BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" +cat ${log_dir}/bandit.log +$BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET + +if [ ${exit_code} -ne 0 ]; then + $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Bandit error details." && $RESET + exit 1 +fi + +$BOLD_PURPLE && echo "Congratulations, Bandit check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET +exit 0 diff --git a/.github/workflows/scripts/codeScan/hadolint.sh b/.github/workflows/scripts/codeScan/hadolint.sh new file mode 100644 index 0000000..dda28e4 --- /dev/null +++ b/.github/workflows/scripts/codeScan/hadolint.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +source /LangChinaOPEA/.github/workflows/scripts/change_color +log_dir=/LangChinaOPEA/.github/workflows/scripts/codeScan + +find . -type f \( -name "Dockerfile*" \) -print -exec hadolint --ignore DL3006 --ignore DL3007 --ignore DL3008 {} \; 2>&1 | tee ${log_dir}/hadolint.log + +if [[ $(grep -c "error" ${log_dir}/hadolint.log) != 0 ]]; then + $BOLD_RED && echo "Error!! Please Click on the artifact button to download and check error details." && $RESET + exit 1 +fi + +$BOLD_PURPLE && echo "Congratulations, Hadolint check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." 
&& $RESET +exit 0 diff --git a/.github/workflows/scripts/codeScan/trellix.sh b/.github/workflows/scripts/codeScan/trellix.sh new file mode 100644 index 0000000..909b2d1 --- /dev/null +++ b/.github/workflows/scripts/codeScan/trellix.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +source ${workspace}/.github/workflows/scripts/change_color +log_dir=${workspace}/.github/workflows/scripts/codeScan + + +echo "---Updating definition (DAT) files ---" +DEFS_URL=https://update.nai.com/products/commonupdater/current/vscandat1000/dat/0000 +echo "Finding latest defs at $DEFS_URL/avvdat.ini..." \ + && wget -q $DEFS_URL/avvdat.ini \ + && echo "SUCCESS" || fail + +inifile="avvdat.ini" +filename=`awk -F"=" '$2 ~ /avvdat.*zip/ { print $2 } ' $inifile` +filename2="$(echo -e "${filename}" | tr -d '[:space:]')" + +if [ -z "$filename2" ] +then + echo "Cannot get defs information from INI file:" + cat $inifile + fail +fi + +echo "Downloading latest defs from $DEFS_URL/$filename2..." \ + && wget -q $DEFS_URL/$filename2 \ + && echo "SUCCESS" || fail + +echo "Extracting latest defs..." \ + && unzip -o $filename2 -d /usr/local/uvscan \ + && echo "SUCCESS" || fail + +echo "--- Scanning ---" +ENV_SCAN_OPTS="--analyze --mime --program --recursive --unzip --threads 4 --summary --verbose --html=${workspace}/.github/workflows/scripts/codeScan/report.html" +echo "Scan Options: $ENV_SCAN_OPTS" + +rm -r ${workspace}/avvdat* +rm -r ${workspace}/.git +uvscan $ENV_SCAN_OPTS ${workspace} 2>&1 | tee ${log_dir}/trellix.log + +if [[ $(grep "Possibly Infected" ${log_dir}/trellix.log | sed 's/[^0-9]//g') != 0 ]]; then + $BOLD_RED && echo "Error!! Please Click on the artifact button to download and check error details." && $RESET + exit 1 +fi + +$BOLD_PURPLE && echo "Congratulations, Trellix Scan passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." 
&& $RESET +exit 0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..16e6790 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,129 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +ci: + autofix_prs: true + autoupdate_schedule: quarterly + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: end-of-file-fixer + files: (.*\.(py|md|rst|yaml|yml|json|ts|js|html|svelte|sh))$ + - id: check-json + - id: check-yaml + args: [--allow-multiple-documents] + - id: debug-statements + - id: mixed-line-ending + args: [--fix=lf] + - id: requirements-txt-fixer + - id: trailing-whitespace + files: (.*\.(py|rst|cmake|yaml|yml|json|ts|js|html|svelte|sh))$ + + - repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.5.5 + hooks: + - id: insert-license + files: (Dockerfile)$ + args: + [ + --license-filepath=.github/license_template.txt, + --use-current-year, + --detect-license-in-X-top-lines=5, + --skip-license-insertion-comment=Copyright, + ] + - id: insert-license + files: (.*\.(py|yaml|yml|sh))$ + args: + [ + --license-filepath=.github/license_template.txt, + --use-current-year, + --detect-license-in-X-top-lines=5, + --skip-license-insertion-comment=Copyright, + ] + - id: insert-license + files: (.*\.(ts|js))$ + args: + [ + --license-filepath=.github/license_template.txt, + --use-current-year, + --detect-license-in-X-top-lines=5, + --skip-license-insertion-comment=Copyright, + --comment-style=//, + ] + - id: insert-license + files: (.*\.(html|svelte))$ + args: + [ + --license-filepath=.github/license_template.txt, + --use-current-year, + --detect-license-in-X-top-lines=5, + --skip-license-insertion-comment=Copyright, + --comment-style=, + ] + + - repo: https://github.com/asottile/yesqa + rev: v1.5.0 + hooks: + - id: yesqa + name: Unused noqa + + - repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + + - repo: https://github.com/PyCQA/docformatter + rev: 06907d0 + hooks: + - id: docformatter + args: [ + --in-place, + --wrap-summaries=0, # 0 means disable wrap + --wrap-descriptions=0, # 0 means disable wrap + --black, + --style=google, + ] + + - repo: local + hooks: + - id: prettier + name: prettier + description: "" + entry: prettier --write --ignore-unknown + language: node + "types": [text] + args: [--print-width=120] + types_or: [markdown, html, css, scss, javascript, json, ts, shell, sh] + require_serial: false + additional_dependencies: ["prettier@latest"] + minimum_pre_commit_version: "0" + + - repo: https://github.com/psf/black.git + rev: 24.4.2 + hooks: + - id: black + files: (.*\.py)$ + + - repo: https://github.com/asottile/blacken-docs + rev: 1.18.0 + hooks: + - id: blacken-docs + args: [--line-length=120, --skip-errors] + additional_dependencies: + - black==24.3.0 + + - repo: https://github.com/codespell-project/codespell + rev: v2.3.0 + hooks: + - id: codespell + args: [-w] + additional_dependencies: + - tomli + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.5.0 + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix, --no-cache] diff --git a/pyproject.toml b/pyproject.toml index c185e41..4fd568e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,3 +60,112 @@ ruff = "^0.5" [tool.poetry.group.typing.dependencies] mypy = "^1.10" + +[tool.isort] +profile = "black" +line_length = 120 +extend_skip_glob = ["**/__init__.py"] + + +[tool.black] +line-length = 120 + + +[tool.codespell] +skip = 
'*.po,*.js,*.map,*.js.map,*.css.map,*.json,*.sql' +count = '' +quiet-level = 3 +ignore-words = ".github/code_spell_ignore.txt" + + +[tool.ruff] +# Exclude a variety of commonly ignored directories. +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", +] + +# Same as Black. +line-length = 120 +indent-width = 4 + +# Assume Python 3.10 +target-version = "py310" + +[tool.ruff.lint] +# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. +# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or +# McCabe complexity (`C901`) by default. +select = ["E4", "E7", "E9", "F"] +ignore = [ + "E402", # Module level import not at top of file + "E501", # Line too long (121 > 120 characters) + "E721", # Do not compare types, use isinstance() + "E722", # Do not use bare except + "E731", # Do not assign a lambda expression, use a def + "E741", # Do not use variables named ‘l’, ‘O’, or ‘I’ + "F401", # {name} imported but unused; Ignore F401 because ruff can't fix conditional imports + "F403", # from {name} import * used; unable to detect undefined names + "F405", # {name} may be undefined, or defined from star imports + "F841", # Local variable is assigned to but never used{name} +] + +# Allow fix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +ignore-init-module-imports = true + +[tool.ruff.format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" + +# Enable auto-formatting of code examples in docstrings. Markdown, +# reStructuredText code/literal blocks and doctests are all supported. +# +# This is currently disabled by default, but it is planned for this +# to be opt-out in the future. +docstring-code-format = false + +# Set the line length limit used when formatting code snippets in +# docstrings. +# +# This only has an effect when the `docstring-code-format` setting is +# enabled. +docstring-code-line-length = "dynamic" diff --git a/samples/README.md b/samples/README.md index 340c794..e6c8b97 100644 --- a/samples/README.md +++ b/samples/README.md @@ -13,7 +13,7 @@ Data Privacy: By running models locally, you ensure that sensitive data does not Reduced Latency: Local inference eliminates the need for network calls to external services, resulting in faster response times. -Flexibility: You can bring your own OPEA validated [models](https://github.com/opea-project/GenAIComps/blob/main/comps/llms/text-generation/README.md#validated-llm-models) and switch between different models as needed, tailoring the solution to your specific requirements. +Flexibility: You can bring your own OPEA validated [models](https://github.com/opea-project/GenAIComps/blob/main/comps/llms/src/text-generation/README.md#validated-llm-models) and switch between different models as needed, tailoring the solution to your specific requirements. 
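The hunk above rewrites a documentation link, and the pr-link-path-scan workflow added earlier in this patch will re-check it on every PR. A link like this can be sanity-checked locally before pushing, using the same curl probe the workflow runs; the sketch below assumes only that `curl` is installed.

```bash
# Probe the updated link the way pr-link-path-scan does: follow redirects
# (-L), stay quiet (-s), discard the body (-o /dev/null), and print only
# the final HTTP status code.
url="https://github.com/opea-project/GenAIComps/blob/main/comps/llms/src/text-generation/README.md#validated-llm-models"
status=$(curl -L -s -o /dev/null -w "%{http_code}" "$url")
if [ "$status" -eq 200 ]; then
  echo "OK: $url"
else
  echo "Broken ($status): $url"
fi
```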
To run the services, set up the environment variables: From 9a39ce48c528f9c62173967eb8df13457f8a41d3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 7 Mar 2025 03:10:52 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 3 +- langchain_opea/integrations/gaudiutils.py | 20 ++--- langchain_opea/native_chat_models.py | 84 ++++++--------------- samples/README.md | 7 +- tests/integration_tests/test_chat_models.py | 3 +- tests/integration_tests/test_embeddings.py | 3 +- 6 files changed, 42 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index 240f4be..0a0e025 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,6 @@ To install the package from a pre-built wheel, run: ```bash poetry build ``` - 2. **Install via Wheel File**: Install the package using the generated wheel file. ```bash pip install dist/langchain_opea-0.1.0-py3-none-any.whl @@ -71,4 +70,4 @@ llm = OPEALLM( llm.invoke("The meaning of life is") ``` -Check out [Samples](./samples/README.md) for more examples using the OPEA Langchain package. \ No newline at end of file +Check out [Samples](./samples/README.md) for more examples using the OPEA Langchain package. diff --git a/langchain_opea/integrations/gaudiutils.py b/langchain_opea/integrations/gaudiutils.py index b03ddb7..2916503 100644 --- a/langchain_opea/integrations/gaudiutils.py +++ b/langchain_opea/integrations/gaudiutils.py @@ -1,16 +1,16 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse import copy import glob import os -import argparse import shutil import tempfile import time from pathlib import Path import torch -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer -from transformers.utils import check_min_version - from optimum.habana.checkpoint_utils import ( get_ds_injection_policy, get_repo_root, @@ -24,6 +24,9 @@ get_habana_frameworks_version, set_seed, ) +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers.utils import check_min_version + def setup_parser(parser): # Arguments management @@ -32,7 +35,7 @@ def setup_parser(parser): "--model_name_or_path", default=None, type=str, - #required=True, + # required=True, help="Path to pre-trained model (on the HF Hub or locally).", ) parser.add_argument( @@ -245,7 +248,7 @@ def setup_parser(parser): parser.add_argument( "--book_source", action="store_true", - help="Whether to use project Guttenberg books data as input. Usefull for testing large sequence lenghts.", + help="Whether to use project Guttenberg books data as input. 
Useful for testing large sequence lengths.", ) parser.add_argument( "--torch_compile", @@ -291,6 +294,7 @@ def setup_parser(parser): ) return args + def adjust_batch(batch, size): curr_size = batch["input_ids"].shape[1] if curr_size >= size: @@ -413,7 +417,6 @@ def setup_device(args): # patching LinearAllreduce to use ScopedLinearAllReduce def patch_scoped_linear_all_reduce(model): from deepspeed.module_inject.layers import LinearAllreduce - from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce for name, module in model.named_children(): @@ -458,7 +461,7 @@ def setup_model(args, model_dtype, model_kwargs, logger): if args.peft_model is not None: model = peft_model(args, model_dtype, logger, **model_kwargs) else: - if args.model_name_or_path=="neo4j/text2cypher-gemma-2-9b-it-finetuned-2024v1": + if args.model_name_or_path == "neo4j/text2cypher-gemma-2-9b-it-finetuned-2024v1": model = AutoModelForCausalLM.from_pretrained( "google/gemma-2-9b-it", torch_dtype=model_dtype, **model_kwargs ) @@ -480,7 +483,6 @@ def setup_model(args, model_dtype, model_kwargs, logger): if args.use_hpu_graphs: from habana_frameworks.torch.hpu import wrap_in_hpu_graph - from optimum.habana.transformers.trainer import _is_peft_model if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": diff --git a/langchain_opea/native_chat_models.py b/langchain_opea/native_chat_models.py index 048288b..01e07ec 100644 --- a/langchain_opea/native_chat_models.py +++ b/langchain_opea/native_chat_models.py @@ -1,39 +1,25 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 """Native Chat Wrapper.""" -from typing import Any, AsyncIterator, Iterator, List, Optional import logging +from typing import Any, AsyncIterator, Iterator, List, Optional + from langchain_core._api.deprecation import deprecated -from langchain_core.callbacks.manager import ( - AsyncCallbackManagerForLLMRun, - CallbackManagerForLLMRun, -) -from langchain_core.language_models.chat_models import ( - BaseChatModel, - agenerate_from_stream, - generate_from_stream, -) -from langchain_core.messages import ( - AIMessage, - AIMessageChunk, - BaseMessage, - HumanMessage, - SystemMessage, -) -from langchain_core.outputs import ( - ChatGeneration, - ChatGenerationChunk, - ChatResult, - LLMResult, -) -from pydantic import model_validator -from typing_extensions import Self +from langchain_core.callbacks.manager import AsyncCallbackManagerForLLMRun, CallbackManagerForLLMRun +from langchain_core.language_models.chat_models import BaseChatModel, agenerate_from_stream, generate_from_stream +from langchain_core.messages import AIMessage, AIMessageChunk, BaseMessage, HumanMessage, SystemMessage +from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult, LLMResult from langchain_huggingface.llms.huggingface_pipeline import HuggingFacePipeline from pipeline import GaudiTextGenerationPipeline +from pydantic import model_validator +from typing_extensions import Self DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful, and honest assistant.""" DEFAULT_MODEL_ID = "Intel/neural-chat-7b-v3-3" logger = logging.getLogger(__name__) + class AttributeContainer: def __init__(self, **kwargs): # Set attributes dynamically based on keyword arguments @@ -96,9 +82,9 @@ def __init__(self, **kwargs): bucket_internal=False, ) + class ChatNative(BaseChatModel): - """ - Wrapper for using LLMs run on Intel Gaudi as ChatModels. 
+ """Wrapper for using LLMs run on Intel Gaudi as ChatModels. To use, you should have the `mlflow[genai]` python package installed. For more information, see https://mlflow.org/docs/latest/llms/deployments. @@ -114,8 +100,7 @@ class ChatNative(BaseChatModel): """ llm: Any - """LLM, must be of type HuggingFacePipeline - """ + """LLM, must be of type HuggingFacePipeline.""" system_message: SystemMessage = SystemMessage(content=DEFAULT_SYSTEM_PROMPT) tokenizer: Any = None model_id: Optional[str] = None @@ -129,28 +114,20 @@ def __init__(self, **kwargs: Any): args.model_name_or_path = self.model_name if self.device == "hpu": - pipe = GaudiTextGenerationPipeline( - args, - logger, - use_with_langchain=True - ) + pipe = GaudiTextGenerationPipeline(args, logger, use_with_langchain=True) hfpipe = HuggingFacePipeline(pipeline=pipe) self.llm = hfpipe self.tokenizer = pipe.tokenizer else: raise NotImplementedError(f"Only support hpu device now, device {self.device} not supported.") - @model_validator(mode="after") def validate_llm(self) -> Self: if not isinstance( self.llm, (HuggingFacePipeline), ): - raise TypeError( - "Expected llm to be one of HuggingFacePipeline" - f", received {type(self.llm)}" - ) + raise TypeError("Expected llm to be one of HuggingFacePipeline" f", received {type(self.llm)}") return self def _stream( @@ -192,15 +169,11 @@ def _generate( **kwargs: Any, ) -> ChatResult: if self.streaming: - stream_iter = self._stream( - messages, stop=stop, run_manager=run_manager, **kwargs - ) + stream_iter = self._stream(messages, stop=stop, run_manager=run_manager, **kwargs) return generate_from_stream(stream_iter) llm_input = self._to_chat_prompt(messages) - llm_result = self.llm._generate( - prompts=[llm_input], stop=stop, run_manager=run_manager, **kwargs - ) + llm_result = self.llm._generate(prompts=[llm_input], stop=stop, run_manager=run_manager, **kwargs) return self._to_chat_result(llm_result) async def _agenerate( @@ -211,15 +184,11 @@ async def _agenerate( **kwargs: Any, ) -> ChatResult: if self.streaming: - stream_iter = self._astream( - messages, stop=stop, run_manager=run_manager, **kwargs - ) + stream_iter = self._astream(messages, stop=stop, run_manager=run_manager, **kwargs) return await agenerate_from_stream(stream_iter) llm_input = self._to_chat_prompt(messages) - llm_result = await self.llm._agenerate( - prompts=[llm_input], stop=stop, run_manager=run_manager, **kwargs - ) + llm_result = await self.llm._agenerate(prompts=[llm_input], stop=stop, run_manager=run_manager, **kwargs) return self._to_chat_result(llm_result) def _to_chat_prompt( @@ -235,9 +204,7 @@ def _to_chat_prompt( messages_dicts = [self._to_chatml_format(m) for m in messages] - return self.tokenizer.apply_chat_template( - messages_dicts, tokenize=False, add_generation_prompt=True - ) + return self.tokenizer.apply_chat_template(messages_dicts, tokenize=False, add_generation_prompt=True) def _to_chatml_format(self, message: BaseMessage) -> dict: """Convert LangChain message to ChatML format.""" @@ -258,16 +225,11 @@ def _to_chat_result(llm_result: LLMResult) -> ChatResult: chat_generations = [] for g in llm_result.generations[0]: - chat_generation = ChatGeneration( - message=AIMessage(content=g.text), generation_info=g.generation_info - ) + chat_generation = ChatGeneration(message=AIMessage(content=g.text), generation_info=g.generation_info) chat_generations.append(chat_generation) - return ChatResult( - generations=chat_generations, llm_output=llm_result.llm_output - ) + return 
ChatResult(generations=chat_generations, llm_output=llm_result.llm_output)
 
     @property
     def _llm_type(self) -> str:
         return "gaudi-chat-wrapper"
-
diff --git a/samples/README.md b/samples/README.md
index e6c8b97..a2c418c 100644
--- a/samples/README.md
+++ b/samples/README.md
@@ -1,11 +1,11 @@
 # Running Langchain OPEA SDK with OPEA microservices
 
-The OPEA Langchain SDK facilitates effortless interaction with open-source large language models, such as Llama 3, directly on your local machine. To leverage the SDK, you need to deploy an OpenAI compatible model serving. 
+The OPEA Langchain SDK facilitates effortless interaction with open-source large language models, such as Llama 3, directly on your local machine. To leverage the SDK, you need to deploy an OpenAI-compatible model-serving endpoint.
 
 This local microservice deployment is crucial for harnessing the power of advanced language models while ensuring data privacy, reducing latency, and providing the flexibility to select models without relying on external APIs.
 
 ## 1. Starting the microservices using compose
 
-A prerequisite for using Langchain OPEA SDK is that users must have OpenAI compatible LLM text/embeddings generation service (etc., TGI, vLLM) already running. Langchain OPEA SDK package uses these deployed endpoints to help create end to end enterprise generative AI solutions. 
+A prerequisite for using the Langchain OPEA SDK is an OpenAI-compatible LLM text/embeddings generation service (e.g., TGI, vLLM) that is already running. The Langchain OPEA SDK package uses these deployed endpoints to help create end-to-end enterprise generative AI solutions.
 
 This approach offers several benefits:
 
@@ -66,12 +66,11 @@ To install the package from a pre-built wheel, run:
   ```bash
   poetry build
   ```
-
 2. **Install via Wheel File**: Install the package using the generated wheel file.
   ```bash
   pip install dist/langchain_opea-0.1.0-py3-none-any.whl
   ```
-
+
 ## 4. 
Install Jupyter Notebook ```bash diff --git a/tests/integration_tests/test_chat_models.py b/tests/integration_tests/test_chat_models.py index 22e55da..da1f2bb 100644 --- a/tests/integration_tests/test_chat_models.py +++ b/tests/integration_tests/test_chat_models.py @@ -6,9 +6,10 @@ import pytest from langchain_core.language_models import BaseChatModel -from langchain_opea.chat_models import ChatOPEA from langchain_tests.integration_tests import ChatModelIntegrationTests +from langchain_opea.chat_models import ChatOPEA + OPEA_API_BASE = "http://localhost:9009/v1" OPEA_API_KEY = "my_secret_value" MODEL_NAME = "Intel/neural-chat-7b-v3-3" diff --git a/tests/integration_tests/test_embeddings.py b/tests/integration_tests/test_embeddings.py index 519d51a..03ca585 100644 --- a/tests/integration_tests/test_embeddings.py +++ b/tests/integration_tests/test_embeddings.py @@ -4,9 +4,10 @@ from typing import Type -from langchain_opea.embeddings import OPEAEmbeddings from langchain_tests.integration_tests import EmbeddingsIntegrationTests +from langchain_opea.embeddings import OPEAEmbeddings + OPEA_API_BASE = "http://localhost:6006/v1" OPEA_API_KEY = "my_secret_value" MODEL_NAME = "BAAI/bge-large-en-v1.5" From 11c65ea953c81eaedacabc606243936827def427 Mon Sep 17 00:00:00 2001 From: Wenxin Zhang Date: Fri, 7 Mar 2025 11:25:08 +0800 Subject: [PATCH 3/4] update Signed-off-by: Wenxin Zhang --- .github/workflows/mix-trellix.yml | 2 +- .github/workflows/{ => scripts}/change_color | 0 langchain_opea/integrations/gaudiutils.py | 3 +++ langchain_opea/native_chat_models.py | 3 ++- 4 files changed, 6 insertions(+), 2 deletions(-) rename .github/workflows/{ => scripts}/change_color (100%) diff --git a/.github/workflows/mix-trellix.yml b/.github/workflows/mix-trellix.yml index b0b3c73..4fd76bf 100644 --- a/.github/workflows/mix-trellix.yml +++ b/.github/workflows/mix-trellix.yml @@ -12,7 +12,7 @@ on: jobs: Trellix: - runs-on: trellix + runs-on: ubuntu-latest steps: - name: Clean Up Working Directory run: sudo rm -rf ${{github.workspace}}/* diff --git a/.github/workflows/change_color b/.github/workflows/scripts/change_color similarity index 100% rename from .github/workflows/change_color rename to .github/workflows/scripts/change_color diff --git a/langchain_opea/integrations/gaudiutils.py b/langchain_opea/integrations/gaudiutils.py index b03ddb7..0fe34a8 100644 --- a/langchain_opea/integrations/gaudiutils.py +++ b/langchain_opea/integrations/gaudiutils.py @@ -5,6 +5,7 @@ import shutil import tempfile import time +import logging from pathlib import Path import torch @@ -25,6 +26,8 @@ set_seed, ) +logger = logging.getLogger(__name__) + def setup_parser(parser): # Arguments management parser.add_argument("--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu") diff --git a/langchain_opea/native_chat_models.py b/langchain_opea/native_chat_models.py index 048288b..1a3beb3 100644 --- a/langchain_opea/native_chat_models.py +++ b/langchain_opea/native_chat_models.py @@ -2,6 +2,7 @@ from typing import Any, AsyncIterator, Iterator, List, Optional import logging +import os from langchain_core._api.deprecation import deprecated from langchain_core.callbacks.manager import ( AsyncCallbackManagerForLLMRun, @@ -25,7 +26,7 @@ ChatResult, LLMResult, ) -from pydantic import model_validator +from pydantic import model_validator, Field from typing_extensions import Self from langchain_huggingface.llms.huggingface_pipeline import HuggingFacePipeline from pipeline import GaudiTextGenerationPipeline From 
5ff07e47510665598828faa4dcff2a726849ccf4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 7 Mar 2025 03:33:02 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- langchain_opea/integrations/gaudiutils.py | 4 ++-- langchain_opea/native_chat_models.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/langchain_opea/integrations/gaudiutils.py b/langchain_opea/integrations/gaudiutils.py index 49f521a..f2b7703 100644 --- a/langchain_opea/integrations/gaudiutils.py +++ b/langchain_opea/integrations/gaudiutils.py @@ -4,11 +4,11 @@ import argparse import copy import glob +import logging import os import shutil import tempfile import time -import logging from pathlib import Path import torch @@ -28,9 +28,9 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.utils import check_min_version - logger = logging.getLogger(__name__) + def setup_parser(parser): # Arguments management parser.add_argument("--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu") diff --git a/langchain_opea/native_chat_models.py b/langchain_opea/native_chat_models.py index 2a23740..2b21166 100644 --- a/langchain_opea/native_chat_models.py +++ b/langchain_opea/native_chat_models.py @@ -13,7 +13,7 @@ from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult, LLMResult from langchain_huggingface.llms.huggingface_pipeline import HuggingFacePipeline from pipeline import GaudiTextGenerationPipeline -from pydantic import model_validator, Field +from pydantic import Field, model_validator from typing_extensions import Self DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful, and honest assistant."""
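A closing note for contributors: patches 2 and 4 were generated automatically by pre-commit.ci, so the same fixes can be reproduced locally with the hooks that patch 1 adds in `.pre-commit-config.yaml`. A minimal sketch, assuming a Python environment with `pip` available:

```bash
# Install the pre-commit framework and run every configured hook
# (isort, black, prettier, codespell, ruff, license insertion, ...)
# against the whole tree, as pre-commit.ci does.
pip install pre-commit
pre-commit run --all-files

# Optional: register the hooks as a git pre-commit hook so they run
# automatically on every future commit.
pre-commit install
```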