From 47a178b518cb8929b308959174b16fca2bef2cb5 Mon Sep 17 00:00:00 2001
From: Yifan Li <109183385+yf711@users.noreply.github.com>
Date: Wed, 15 May 2024 21:38:52 -0700
Subject: [PATCH] [EP Perf] Fix on EP Perf (#20683)

### Description
* Partially revert the [previous change](https://github.com/microsoft/onnxruntime/pull/19804)
* Redo the concurrency_test_result parsing outside of post.py (in the new parse_mem_concurrency_test.py)
* Add support for syncing the memtest result to the database

### Motivation and Context
To fix the error that occurs when CI runs on two model groups:
- When running on two model groups, the [previous change](https://github.com/microsoft/onnxruntime/pull/19804) wrongly navigated two directory levels up after processing one model group, where only one level is needed. After that, the script could not find the second model group (see the sketch below).
- Running on a single model group does not reproduce the issue.
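A minimal sketch of the traversal bug, using a hypothetical /tmp/result layout rather than the actual CI paths:

```python
import os

os.chdir("/tmp/result")  # hypothetical root holding one folder per model group
for model_group in sorted(os.listdir()):
    os.chdir(model_group)  # enter the group via a relative path, as post.py does
    # ... parse this group's CSV files ...
    os.chdir("../..")  # bug: two levels up leaves the root, so the next
                       # relative chdir(model_group) raises FileNotFoundError
    # os.chdir("..")   # fix: one level up returns to the root
```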
---
 .../perf/parse_mem_concurrency_test.py        | 132 ++++++++++++++++++
 .../python/tools/tensorrt/perf/post.py        |  61 ++------
 ...linux-gpu-tensorrt-daily-perf-pipeline.yml |  39 ++++--
 3 files changed, 173 insertions(+), 59 deletions(-)
 create mode 100644 onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py

diff --git a/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py b/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py
new file mode 100644
index 0000000000000..492de13fb42b5
--- /dev/null
+++ b/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py
@@ -0,0 +1,132 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import csv
+import datetime
+import os
+import re
+
+import pandas as pd
+from azure.kusto.data import KustoConnectionStringBuilder
+from azure.kusto.ingest import QueuedIngestClient
+from post import get_identifier, parse_arguments, write_table
+
+
+def parse_valgrind_log(input_path, output_path, keywords):
+    is_definitely_lost = False
+    is_ort_trt_related = False
+    buffer = []
+    leak_block = None
+    leak_bytes = None
+    keyword = None
+    results = []
+
+    with open(input_path) as file:
+        lines = file.readlines()
+
+    for line in lines:
+        line = line.strip()  # noqa: PLW2901
+        # Remove "==xxxxx==" pattern from the line
+        line = line.split("==")[-1].strip()  # noqa: PLW2901
+
+        if "blocks are definitely lost in loss" in line:
+            is_definitely_lost = True
+            # Extract LeakBlock and LeakBytes
+            match = re.search(r"([\d,]+) byte[s]? in ([\d,]+) block[s]?", line)
+            if match:
+                leak_bytes = match.group(1).replace(",", "")
+                leak_block = match.group(2).replace(",", "")
+            continue
+
+        if is_definitely_lost:
+            if line:
+                buffer.append(line)
+                for word in keywords:
+                    if word in line:
+                        is_ort_trt_related = True
+                        keyword = word
+                        break
+
+        # End of section
+        if is_definitely_lost and not line:
+            if is_ort_trt_related:
+                results.append((keyword, leak_block, leak_bytes, "\n".join(buffer)))
+            # Reset var
+            is_definitely_lost = False
+            is_ort_trt_related = False
+            buffer = []
+            leak_block = None
+            leak_bytes = None
+            keyword = None
+
+    # Writing results to CSV
+    with open(output_path, "w", newline="") as csvfile:
+        csvwriter = csv.writer(csvfile)
+        csvwriter.writerow(["Keyword", "LeakBlock", "LeakBytes", "ValgrindMessage"])
+        for entry in results:
+            csvwriter.writerow([entry[0], entry[1], entry[2], entry[3]])
+
+
+def parse_concurrency_test_log(input_path, output_path):
+    with open(input_path) as log_file:
+        log_content = log_file.read()
+
+    failed_cases_section = log_content.split("Failed Test Cases:")[1]
+
+    # passed = 1 if no failed test cases
+    if failed_cases_section.strip() == "":
+        passed = 1
+    else:
+        passed = 0
+
+    with open(output_path, "w", newline="") as csv_file:
+        csv_writer = csv.writer(csv_file)
+        csv_writer.writerow(["Passed", "Log"])
+        csv_writer.writerow([passed, log_content])
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+
+    # connect to database
+    kcsb_ingest = KustoConnectionStringBuilder.with_az_cli_authentication(args.kusto_conn)
+    ingest_client = QueuedIngestClient(kcsb_ingest)
+    identifier = get_identifier(
+        args.commit_datetime, args.commit_hash, args.trt_version, args.branch, args.use_tensorrt_oss_parser
+    )
+    upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)
+
+    try:
+        result_mem_test_path = args.report_folder
+        os.chdir(result_mem_test_path)
+        # Parse mem_test log
+        logs = ["valgrind.log", "concurrency_test.log"]
+        csv_paths = ["mem_test.csv", "concurrency_test.csv"]
+        for log, csv_path in zip(logs, csv_paths):
+            if os.path.exists(log):
+                print(f"{identifier}: Parsing {log}")
+                if log == logs[0]:
+                    parse_valgrind_log(log, csv_path, ["TensorrtExecutionProvider", "TensorRT"])
+                else:
+                    parse_concurrency_test_log(log, csv_path)
+
+        # Upload to db
+        for csv_path, db_table_name in zip(csv_paths, ["ep_valgrind_record", "ep_concurrencytest_record"]):
+            if os.path.exists(csv_path):
+                table = pd.read_csv(csv_path)
+                write_table(
+                    ingest_client,
+                    args.database,
+                    table,
+                    db_table_name,
+                    upload_time,
+                    identifier,
+                    args.branch,
+                    args.commit_hash,
+                    args.commit_datetime,
+                )
+                print(f"{identifier}: {csv_path} is synced to db")
+
+    except Exception as e:
+        print(str(e))
diff --git a/onnxruntime/python/tools/tensorrt/perf/post.py b/onnxruntime/python/tools/tensorrt/perf/post.py
index fe941096e2fe8..9b78743d7e751 100644
--- a/onnxruntime/python/tools/tensorrt/perf/post.py
+++ b/onnxruntime/python/tools/tensorrt/perf/post.py
@@ -3,7 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 import argparse
-import csv
 import datetime
 import os
 import sys
@@ -421,11 +420,10 @@ def main():
     upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)
 
     try:
-        # Load EP Perf test results from /result
         result_file = args.report_folder
-        result_perf_test_path = os.path.join(result_file, "result")
-        folders = os.listdir(result_perf_test_path)
-        os.chdir(result_perf_test_path)
+
+        folders = os.listdir(result_file)
+        os.chdir(result_file)
 
         tables = [
             fail_name,
@@ -448,13 +446,13 @@ def main():
         for model_group in folders:
             os.chdir(model_group)
             csv_filenames = os.listdir()
-            for csv_file in csv_filenames:
-                table = pd.read_csv(csv_file)
-                if session_name in csv_file:
+            for csv in csv_filenames:
+                table = pd.read_csv(csv)
+                if session_name in csv:
                     table_results[session_name] = pd.concat(
                         [table_results[session_name], get_session(table, model_group)], ignore_index=True
                     )
-                elif specs_name in csv_file:
+                elif specs_name in csv:
                     table_results[specs_name] = pd.concat(
                         [
                             table_results[specs_name],
@@ -462,12 +460,12 @@ def main():
                         ],
                         ignore_index=True,
                     )
-                elif fail_name in csv_file:
+                elif fail_name in csv:
                     table_results[fail_name] = pd.concat(
                         [table_results[fail_name], get_failures(table, model_group)],
                         ignore_index=True,
                     )
-                elif latency_name in csv_file:
+                elif latency_name in csv:
                     table_results[memory_name] = pd.concat(
                         [table_results[memory_name], get_memory(table, model_group)],
                         ignore_index=True,
@@ -477,11 +475,11 @@ def main():
                         [table_results[latency_name], get_latency(table, model_group)],
                         ignore_index=True,
                     )
-                elif status_name in csv_file:
+                elif status_name in csv:
                     table_results[status_name] = pd.concat(
                         [table_results[status_name], get_status(table, model_group)], ignore_index=True
                     )
-                elif op_metrics_name in csv_file:
+                elif op_metrics_name in csv:
                     table = table.assign(Group=model_group)
                     table_results[op_metrics_name] = pd.concat(
                         [table_results[op_metrics_name], table], ignore_index=True
@@ -515,43 +513,6 @@ def main():
                     args.commit_datetime,
                 )
 
-        # Load concurrency test results
-        result_mem_test_path = os.path.join(result_file, "result_mem_test")
-        os.chdir(result_mem_test_path)
-        log_path = "concurrency_test.log"
-        if os.path.exists(log_path):
-            print("Generating concurrency test report")
-            with open(log_path) as log_file:
-                log_content = log_file.read()
-
-            failed_cases_section = log_content.split("Failed Test Cases:")[1]
-
-            # passed = 1 if no failed test cases
-            if failed_cases_section.strip() == "":
-                passed = 1
-            else:
-                passed = 0
-
-            csv_path = "concurrency_test.csv"
-            with open(csv_path, "w", newline="") as csv_file:
-                csv_writer = csv.writer(csv_file)
-                csv_writer.writerow(["Passed", "Log"])
-                csv_writer.writerow([passed, log_content])
-
-            db_table_name = "ep_concurrencytest_record"
-            table = pd.read_csv(csv_path)
-            write_table(
-                ingest_client,
-                args.database,
-                table,
-                db_table_name,
-                upload_time,
-                identifier,
-                args.branch,
-                args.commit_hash,
-                args.commit_datetime,
-            )
-
     except BaseException as e:
         print(str(e))
         sys.exit(1)
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
index a16647f17280d..7cfff805c3b3c 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
@@ -1,7 +1,7 @@
 parameters:
 
 - name: PostToDashboard
-  displayName: Post to Dashboard
+  displayName: Post EP Perf results to Dashboard
   type: boolean
   default: true
 
@@ -30,7 +30,7 @@ parameters:
     - "partner-models"
 
 - name: MemTest
-  displayName: Run Memory Test and Concurrency Test
+  displayName: Run Memory and Concurrency Test
   type: boolean
   default: true
 
@@ -147,11 +147,27 @@ jobs:
         workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
         condition: always()
 
-    - task: PublishBuildArtifacts@1
-      inputs:
-        pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
-        artifactName: 'result-$(Build.BuildNumber)'
-
+    - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs'
+      displayName: 'Install dashboard dependencies'
+
+    - script: |
+        az --version || {
+          echo "Azure CLI not found, installing..."
+          curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+        }
+      displayName: 'Check and Install Azure CLI'
+
+    - task: AzureCLI@2
+      displayName: 'Parse Memory & Concurrency Test Records and Sync'
+      inputs:
+        azureSubscription: AIInfraBuildOnnxRuntimeOSS
+        scriptLocation: inlineScript
+        scriptType: bash
+        inlineScript: |
+          short_hash=$(git rev-parse --short HEAD) &&
+          commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
+          python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py -r $(Build.SourcesDirectory)/Artifact/result_mem_test -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+
   - ${{ if eq(parameters.PostToDashboard, true) }}:
 
     - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs'
@@ -165,7 +181,7 @@ jobs:
       displayName: 'Check and Install Azure CLI'
 
     - task: AzureCLI@2
-      displayName: 'Post EP Perf Results to Dashboard'
+      displayName: 'Azure CLI Post to Dashboard'
       inputs:
        azureSubscription: AIInfraBuildOnnxRuntimeOSS
        scriptLocation: inlineScript
        scriptType: bash
        inlineScript: |
          short_hash=$(git rev-parse --short HEAD) &&
          commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
-          python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+          python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+
+  - task: PublishBuildArtifacts@1
+    inputs:
+      pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
+      artifactName: 'result-$(Build.BuildNumber)'
 
   - template: templates/component-governance-component-detection-steps.yml
     parameters :