diff --git a/.github/workflows/model_test_cpu.yml b/.github/workflows/model_test_cpu.yml index 370316d1..822d73ba 100644 --- a/.github/workflows/model_test_cpu.yml +++ b/.github/workflows/model_test_cpu.yml @@ -32,6 +32,7 @@ env: jobs: Evaluation-Workflow: runs-on: aise-cluster-cpu + timeout-minutes: 10 strategy: matrix: include: @@ -113,21 +114,6 @@ jobs: cd ${{ env.OUT_SCRIPT_PATH }} ls -R - - name: Download Reference Artifact - id: download-artifact - uses: dawidd6/action-download-artifact@v3.1.2 - with: - workflow: model_test_cpu.yml - name: FinalReport - run_id: ${{ vars.ModelTest_CPU_REF_ID }} - path: ${{ env.OUT_SCRIPT_PATH }} - name_is_regexp: true - repo: ${{ github.repository }} - check_artifacts: false - search_artifacts: false - skip_unpack: false - if_no_artifact_found: warn - - name: Display structure of downloaded files run: cd ${{ env.OUT_SCRIPT_PATH }}/log && ls -R @@ -135,24 +121,4 @@ jobs: run: | echo "------ Generating final report.html ------" cd ${{ env.OUT_SCRIPT_PATH }} - mkdir -p generated /usr/bin/bash -x generate_report.sh - env: - RUN_DISPLAY_URL: https://https://github.com/opea-project/GenAIEval/actions/runs/${{ github.run_id }} - BUILD_NUMBER: ${{ github.run_id }} - JOB_STATUS: succeed - - - name: Publish Report - uses: actions/upload-artifact@v4 - if: ${{ !cancelled() }} - with: - name: FinalReport - path: ${{ env.OUT_SCRIPT_PATH }}/generated - - - name: Specify performance regression - if: ${{ !cancelled() }} - run: | - if [ ${{ env.is_perf_reg }} == 'true' ]; then - echo "[Performance Regression] Some model performance regression occurred, please check artifacts and reports." - exit 1 - fi diff --git a/.github/workflows/model_test_hpu.yml b/.github/workflows/model_test_hpu.yml index 3bea0b81..204d9044 100644 --- a/.github/workflows/model_test_hpu.yml +++ b/.github/workflows/model_test_hpu.yml @@ -32,6 +32,7 @@ env: jobs: Evaluation-Workflow: runs-on: aise-cluster-hpu + timeout-minutes: 10 strategy: matrix: include: @@ -114,21 +115,6 @@ jobs: cd ${{ env.OUT_SCRIPT_PATH }} ls -R - - name: Download Reference Artifact - id: download-artifact - uses: dawidd6/action-download-artifact@v3.1.2 - with: - workflow: model_test_hpu.yml - name: FinalReport - run_id: ${{ vars.ModelTest_HPU_REF_ID }} - path: ${{ env.OUT_SCRIPT_PATH }} - name_is_regexp: true - repo: ${{ github.repository }} - check_artifacts: false - search_artifacts: false - skip_unpack: false - if_no_artifact_found: warn - - name: Display structure of downloaded files run: cd ${{ env.OUT_SCRIPT_PATH }}/log && ls -R @@ -136,24 +122,4 @@ jobs: run: | echo "------ Generating final report.html ------" cd ${{ env.OUT_SCRIPT_PATH }} - mkdir -p generated /usr/bin/bash -x generate_report.sh - env: - RUN_DISPLAY_URL: https://https://github.com/opea-project/GenAIEval/actions/runs/${{ github.run_id }} - BUILD_NUMBER: ${{ github.run_id }} - JOB_STATUS: succeed - - - name: Publish Report - uses: actions/upload-artifact@v4 - if: ${{ !cancelled() }} - with: - name: FinalReport - path: ${{ env.OUT_SCRIPT_PATH }}/generated - - - name: Specify performance regression - if: ${{ !cancelled() }} - run: | - if [ ${{ env.is_perf_reg }} == 'true' ]; then - echo "[Performance Regression] Some model performance regression occurred, please check artifacts and reports." - exit 1 - fi diff --git a/.github/workflows/scripts/models/collect_log.sh b/.github/workflows/scripts/models/collect_log.sh index d9c36650..936843ef 100644 --- a/.github/workflows/scripts/models/collect_log.sh +++ b/.github/workflows/scripts/models/collect_log.sh @@ -5,34 +5,39 @@ set -eo pipefail set -x -source /GenAIEval/.github/workflows/scripts/change_color + WORKSPACE="/GenAIEval" # get parameters PATTERN='[-a-zA-Z0-9_]*=' PERF_STABLE_CHECK=true for i in "$@"; do case $i in - --datasets*) - datasets=`echo $i | sed "s/${PATTERN}//"`;; - --device=*) - device=`echo $i | sed "s/${PATTERN}//"`;; - --model=*) - model=`echo $i | sed "s/${PATTERN}//"`;; - --tasks=*) - tasks=`echo $i | sed "s/${PATTERN}//"`;; - *) - echo "Parameter $i not recognized."; exit 1;; + --datasets*) + datasets=$(echo $i | sed "s/${PATTERN}//") + ;; + --device=*) + device=$(echo $i | sed "s/${PATTERN}//") + ;; + --model=*) + model=$(echo $i | sed "s/${PATTERN}//") + ;; + --tasks=*) + tasks=$(echo $i | sed "s/${PATTERN}//") + ;; + *) + echo "Parameter $i not recognized." + exit 1 + ;; esac done log_file="/log/${device}/${model}/${device}-${tasks}-${model}-${datasets}.log" -$BOLD_YELLOW && echo "-------- Collect logs --------" && $RESET - +echo "Collecting logs ......" echo "working in" pwd if [[ ! -f ${log_file} ]]; then - echo "${device};${model};${tasks};${datasets};;" >> ${WORKSPACE}/summary.log + echo "|${device}|${model}|${tasks}|${datasets}|NaN|" >>${WORKSPACE}/summary.log else acc=$(grep -Po "acc .*(\d+(\.\d+)?)" ${log_file} | awk -F "|" '{print $3}' | head -n 1 | sed 's/.*://;s/[^0-9.]//g') - echo "${device};${model};${tasks};${datasets};${acc};" >> ${WORKSPACE}/summary.log + echo "|${device}|${model}|${tasks}|${datasets}|${acc}|" >>${WORKSPACE}/summary.log fi diff --git a/.github/workflows/scripts/models/generate_report.sh b/.github/workflows/scripts/models/generate_report.sh index f79d7ab8..0783395a 100644 --- a/.github/workflows/scripts/models/generate_report.sh +++ b/.github/workflows/scripts/models/generate_report.sh @@ -3,256 +3,25 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -set -x -WORKSPACE=generated -last_log_path=FinalReport -summaryLog=${WORKSPACE}/summary.log -summaryLogLast=${last_log_path}/summary.log -PATTERN='[-a-zA-Z0-9_]*=' +set -xe -function main { - echo "summaryLog: ${summaryLog}" - echo "summaryLogLast: ${summaryLogLast}" - echo "is_perf_reg=false" >> "$GITHUB_ENV" - preprocessing - generate_html_head - generate_html_overview - generate_results - generate_html_footer +function generate_header { + echo "### Model Test Summary" >>$GITHUB_STEP_SUMMARY + echo "|device|model|tasks|datasets|acc|" >>$GITHUB_STEP_SUMMARY + echo "| :----: | :----: | :----: | :----: | :----: |" >>$GITHUB_STEP_SUMMARY } function preprocessing { - for file_path in log/* - do + for file_path in log/*; do if [[ -d ${file_path} ]] && [[ -f ${file_path}/summary.log ]]; then - cat ${file_path}/summary.log >> ${summaryLog} + cat ${file_path}/summary.log >>$GITHUB_STEP_SUMMARY fi done } -function generate_html_overview { - Test_Info_Title="Test Branch Commit ID " - Test_Info="${MR_source_branch} ${ghprbActualCommit} " - - cat >>${WORKSPACE}/report.html < -
-

GenAIEval Tests - [ Job-${BUILD_NUMBER} ]

-

Test Status: ${JOB_STATUS}

-

Summary

- - - - ${Test_Info_Title} - - - - ${Test_Info} - -
Repo
GenAIEval
-eof -} - -function generate_results { - cat >>${WORKSPACE}/report.html <Performance - - - - - - - - - -eof - - devices=$(cat ${summaryLog} | cut -d';' -f1 | awk '!a[$0]++') - for device in ${devices[@]}; do - models=$(cat ${summaryLog} | grep "${device};" | cut -d';' -f2 | awk '!a[$0]++') - for model in ${models[@]}; do - tasks=$(cat ${summaryLog} | grep "${device};${model};" | cut -d';' -f3 | awk '!a[$0]++') - for task in ${tasks[@]}; do - datasets=$(cat ${summaryLog} | grep "${device};${model};${task};" | cut -d';' -f4 | awk '!a[$0]++') - for dataset in ${datasets[@]}; do - benchmark_pattern="${device};${model};${task};${dataset};" - acc=$(cat ${summaryLog} | grep "${benchmark_pattern}" | cut -d';' -f5 | awk '!a[$0]++') - acc_last=nan - if [ $(cat ${summaryLogLast} | grep -c "${benchmark_pattern}") != 0 ]; then - acc_last=$(cat ${summaryLogLast} | grep "${benchmark_pattern}" | cut -d';' -f5 | awk '!a[$0]++') - fi - generate_core - done - done - done - done - cat >>${WORKSPACE}/report.html < -eof -} - -function generate_core { - echo "" >>${WORKSPACE}/report.html - echo | awk -v acc=${acc} -v acc_l=${acc_last} ' - function show_benchmark(a) { - if(a ~/[1-9]/) { - printf("\n",a); - }else { - printf("\n"); - } - } - function compare_new_last(a,b){ - if(a ~/[1-9]/ && b ~/[1-9]/) { - target = b / a; - if(target >= 0.945) { - status_png = "background-color:#90EE90"; - }else { - status_png = "background-color:#FFD2D2"; - job_status = "fail" - } - printf("", status_png, target); - }else{ - if(a == ""){ - job_status = "fail" - status_png = "background-color:#FFD2D2"; - printf("", status_png); - }else{ - printf(""); - } - } - } - BEGIN { - job_status = "pass" - }{ - // current - show_benchmark(acc) - // Last - printf("\n") - show_benchmark(acc_l) - // current vs last - printf("\n"); - compare_new_last(acc,acc_l) - printf("\n"); - } END{ - printf("\n%s", job_status); - } - ' >>${WORKSPACE}/report.html - job_state=$(tail -1 ${WORKSPACE}/report.html) - sed -i '$s/.*//' ${WORKSPACE}/report.html - if [ ${job_state} == 'fail' ]; then - echo "is_perf_reg=true" >> "$GITHUB_ENV" - fi -} - -function generate_html_head { - cat >${WORKSPACE}/report.html < - - - - - - - Daily Tests - TensorFlow - Jenkins - - -eof -} - -function generate_html_footer { - cat >>${WORKSPACE}/report.html < - - -eof +function main { + generate_header + preprocessing } main diff --git a/.github/workflows/scripts/models/model_test.sh b/.github/workflows/scripts/models/model_test.sh index 9b1d4a8e..a593d4ef 100644 --- a/.github/workflows/scripts/models/model_test.sh +++ b/.github/workflows/scripts/models/model_test.sh @@ -5,8 +5,11 @@ set -o pipefail set -x -source /GenAIEval/.github/workflows/scripts/change_color git config --global --add safe.directory /GenAIEval + +export TQDM_POSITION=-1 # fix progress bar on tty mode +export TQDM_MININTERVAL=60 # set refresh every 60s + # get parameters PATTERN='[-a-zA-Z0-9_]*=' PERF_STABLE_CHECK=true @@ -33,7 +36,7 @@ main() { "code-generation") working_dir="/GenAIEval/evals/evaluation/bigcode_evaluation_harness/examples";; *) - echo "Not suppotted task"; exit 1;; + echo "Not supported task"; exit 1;; esac if [[ ${model} == *"opt"* ]]; then pretrained="facebook/${model}" @@ -47,25 +50,12 @@ main() { fi log_dir="/log/${device}/${model}" mkdir -p ${log_dir} - $BOLD_YELLOW && echo "-------- evaluation start --------" && $RESET run_benchmark cp ${log_dir}/${device}-${tasks}-${model}-${datasets}.log /GenAIEval/ } -function prepare() { - ## prepare env - cd ${working_dir} - echo "Working in ${working_dir}" - echo -e "\nInstalling model requirements..." - if [ -f "requirements.txt" ]; then - python -m pip install -r requirements.txt - pip list - else - echo "Not found requirements.txt file." - fi -} - function run_benchmark() { + echo "::group::evaluation start" cd ${working_dir} overall_log="${log_dir}/${device}-${tasks}-${model}-${datasets}.log" python main.py \ @@ -74,13 +64,14 @@ function run_benchmark() { --tasks ${datasets} \ --device ${device} \ --batch_size 112 2>&1 | tee ${overall_log} + echo "::endgroup::" - echo "print log content:" - cat ${overall_log} status=$? if [ ${status} != 0 ]; then - echo "Evaluation process returned non-zero exit code." + echo "::error::Evaluation process returned non-zero exit code!" exit 1 + else + echo "Evaluation process completed successfully!" fi }
DeviceTasksModelDatasetsVSAccuracy
${device}${model}${task}${dataset}New%.2f%.2f
Last
New/Last