Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@

venv/
.venv/
__integration_*__/
__pycache__/
__test_fixtures__/
Expand Down
14 changes: 11 additions & 3 deletions pipelines/azure-benchmarks.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
trigger: none
trigger:
branches:
include:
- main

pr: none

stages:
- stage: RunBenchmark
displayName: 'Run Benchmarks'
pool:
name: $(LINUXPOOL)
image: $(LINUXVMIMAGE)
Expand All @@ -14,6 +19,11 @@ stages:
- job: RunBenchmark
displayName: 'Run Copilot Benchmarks'
steps:
- task: UsePythonVersion@0
displayName: 'Use Python 3.14'
inputs:
versionSpec: '3.14'

- task: AzureCLI@2
displayName: 'Run Copilot Benchmarks script'
inputs:
Expand All @@ -22,5 +32,3 @@ stages:
scriptType: 'pscore'
scriptLocation: 'scriptPath'
scriptPath: $(Build.SourcesDirectory)/pipelines/scripts/Invoke-CopilotBenchmarks.ps1
arguments: >
-BuildId $(OfficialBuildId)
111 changes: 105 additions & 6 deletions pipelines/scripts/Invoke-CopilotBenchmarks.ps1
Original file line number Diff line number Diff line change
@@ -1,12 +1,111 @@
<#
.SYNOPSIS
Installs MSBench CLI into the active Python environment and runs a Copilot Azure benchmark.

.DESCRIPTION
This script is executed by the Azure DevOps benchmark pipeline to run Azure benchmarks using the
github-copilot-cli agent.

The script installs MSBench CLI from the MicrosoftSweBench Azure Artifacts feed and invokes:
msbench-cli run --agent github-copilot-cli --benchmark <benchmark> --model <model>

MSBench CLI reference:
- https://github.com/devdiv-microsoft/MicrosoftSweBench/wiki

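.PARAMETER Benchmark
Benchmark identifier passed to msbench-cli via --benchmark.
Default: azure

.PARAMETER Model
Model identifier passed to msbench-cli via --model.
Default: claude-sonnet-4.5-autodev-test

.PARAMETER NoWait
Whether to add --no-wait to the run command.
Default: false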

.EXAMPLE
PS> ./Invoke-CopilotBenchmarks.ps1

Runs benchmark azure with default model.

.EXAMPLE
PS> ./Invoke-CopilotBenchmarks.ps1 -Benchmark azure.120 -Model "claude-sonnet-4.5-autodev-test" -NoWait

Runs benchmark azure.120 with explicit model and does not wait for completion.

.LINK
https://github.com/devdiv-microsoft/MicrosoftSweBench/wiki
#>

param(
    # Build identifier supplied by the calling pipeline (-BuildId).
    # Accepted so existing callers keep working; not referenced by the script body.
    [string]$BuildId,

    # Benchmark identifier forwarded to 'msbench-cli run --benchmark'.
    [string]$Benchmark = "azure",

    # Model identifier forwarded to 'msbench-cli run --model'.
    [string]$Model = "claude-sonnet-4.5-autodev-test",

    # When set, '--no-wait' is appended to the msbench-cli run command.
    [switch]$NoWait
)

# Strict mode plus "Stop" turns PowerShell errors into terminating errors.
# This does not by itself cover non-zero exit codes from native commands
# (python, az, msbench-cli), so those are checked explicitly where it matters.
Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"

# Both values have defaults, so these guards only trip when a caller
# explicitly passes an empty string.
if (!$Benchmark) {
    throw "Benchmark parameter is required."
}

if (!$Model) {
    throw "Model parameter is required."
}

$indexUrl = "https://pkgs.dev.azure.com/devdiv/_packaging/MicrosoftSweBench/pypi/simple/"
$vaultName = "kv-msbench-eval-azuremcp"
$secretName = "azure-eval-gh-pat"

# Pull the azure-eval-gh-pat secret from KeyVault using Azure CLI and expose it
# to child processes through the environment. The value itself is never logged.
try {
    Write-Host "Retrieving GitHub PAT from KeyVault $vaultName secret $secretName"
    $pat = az keyvault secret show --vault-name $vaultName --name $secretName --query value -o tsv
    # A failed az call yields no output, so an empty value is treated as "not found".
    if (!$pat) {
        throw "Secret $secretName not found in KeyVault $vaultName."
    }

    $env:GITHUB_MCP_SERVER_TOKEN = $pat
}
catch {
    throw "Failed to retrieve GitHub PAT from KeyVault: $_"
}

Write-Host "Benchmark: $Benchmark"
Write-Host "Model: $Model"
Write-Host "NoWait: $NoWait"

# Get-Command throws (ErrorActionPreference = Stop) when python is not on PATH.
$pythonCommand = Get-Command python
Write-Host "Using python from: $($pythonCommand.Path). Version: $(python --version)"

Write-Host "Install/upgrade pip"
python -m pip install --upgrade pip

# keyring + artifacts-keyring let pip authenticate against the Azure Artifacts feed.
Write-Host "Installing artifact authentication dependencies"
python -m pip install keyring artifacts-keyring

# Informational only: shows which versions the feed offers.
Write-Host "Checking MSBench CLI versions from feed"
python -m pip index versions msbench-cli --index-url $indexUrl

Write-Host "Installing/upgrading MSBench CLI"
python -m pip install msbench-cli --index-url $indexUrl
# Fail fast: without a successful install the benchmark run below cannot be trusted.
if ($LASTEXITCODE -ne 0) {
    throw "msbench-cli installation failed with exit code $LASTEXITCODE"
}

Write-Host "MSBench CLI version"
& 'msbench-cli' version

# Arguments are kept in an array and splatted so each value stays a single token.
# '--env' passes the variable NAME; the token value is read from the environment.
$runArgs = @(
    "run",
    "--agent", "github-copilot-cli",
    "--benchmark", $Benchmark,
    "--model", $Model,
    "--env", "GITHUB_MCP_SERVER_TOKEN"
)

if ($NoWait) {
    $runArgs += "--no-wait"
}

Write-Host "Running: msbench-cli $($runArgs -join ' ')"
& 'msbench-cli' @runArgs
# Capture the result immediately: the az call below overwrites $LASTEXITCODE.
$runExitCode = $LASTEXITCODE

Write-Host "Listing key vaults in the resource group"
az keyvault list --resource-group rg-msbench-eval-kv-azure-mcp --query "[].name" -o tsv

if ($runExitCode -ne 0) {
    throw "msbench-cli run failed with exit code $runExitCode"
}
Loading