Skip to content
13 changes: 10 additions & 3 deletions pipelines/azure-benchmarks.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
# CI trigger: only the main branch is included.
# A single `trigger` key is kept; declaring it twice (`trigger: none` plus this mapping)
# is a duplicate mapping key.
trigger:
  branches:
    include:
      - main

pr: none

stages:
Expand All @@ -14,6 +18,11 @@ stages:
- job: RunBenchmark
displayName: 'Run Copilot Benchmarks'
steps:
- task: UsePythonVersion@0
displayName: 'Use Python 3.11'
inputs:
versionSpec: '3.11'

- task: AzureCLI@2
displayName: 'Run Copilot Benchmarks script'
inputs:
Expand All @@ -22,5 +31,3 @@ stages:
scriptType: 'pscore'
scriptLocation: 'scriptPath'
scriptPath: $(Build.SourcesDirectory)/pipelines/scripts/Invoke-CopilotBenchmarks.ps1
arguments: >
-BuildId $(OfficialBuildId)
123 changes: 117 additions & 6 deletions pipelines/scripts/Invoke-CopilotBenchmarks.ps1
Original file line number Diff line number Diff line change
@@ -1,12 +1,123 @@
<#
.SYNOPSIS
Installs MSBench CLI with pip and runs a Copilot Azure benchmark.

.DESCRIPTION
This script is executed by the Azure DevOps benchmark pipeline to run a single Azure benchmark
instance using the github-copilot-cli agent.

The script reads a GitHub PAT from Azure Key Vault through Azure CLI, installs MSBench CLI with pip
(using the python found on PATH) from the MicrosoftSweBench Azure Artifacts feed, validates required inputs, and invokes:
msbench-cli run --agent github-copilot-cli --benchmark <benchmark> --model <model>

Environment variable set by this script:
- GITHUB_MCP_SERVER_TOKEN (value of secret azure-eval-gh-pat in Key Vault kv-msbench-eval-azuremcp)

MSBench CLI reference:
- https://github.com/devdiv-microsoft/MicrosoftSweBench/wiki

.PARAMETER Benchmark
Benchmark identifier passed to msbench-cli via --benchmark.
Default: azure

.PARAMETER Model
Model name passed to msbench-cli via --model.
Default: claude-sonnet-4.5-autodev-test

.PARAMETER NoWait
Switch. When specified, --no-wait is added to the run command.
Default: not specified (no --no-wait argument is added).

.EXAMPLE
PS> ./Invoke-CopilotBenchmarks.ps1 -NoWait

Runs benchmark azure with default model and --no-wait.

.EXAMPLE
PS> ./Invoke-CopilotBenchmarks.ps1 -Benchmark azure.120 -Model "claude-sonnet-4.5-autodev-test"

Runs benchmark azure.120 with explicit model and waits for completion.

.NOTES
The pipeline must run this script in an Azure CLI session that can read the Key Vault secret, and ensure Python is available on PATH.

.LINK
https://github.com/devdiv-microsoft/MicrosoftSweBench/wiki
#>

param(
    # Still accepted so callers that pass -BuildId keep working; the script body does not read it.
    [string]$BuildId,

    # Forwarded to msbench-cli as the value of --benchmark.
    [string]$Benchmark = "azure",

    # Forwarded to msbench-cli as the value of --model.
    [string]$Model = "claude-sonnet-4.5-autodev-test",

    # When present, --no-wait is appended to the msbench-cli run arguments.
    [switch]$NoWait
)

# Strict variable/property checking, and PowerShell errors stop the script.
Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"

# Both values end up on the msbench-cli command line, so empty ones are rejected up front.
# The checks run in the listed order; the first empty value throws its message.
$requiredInputs = @(
    @{ Value = $Benchmark; Message = "Benchmark parameter is required." },
    @{ Value = $Model; Message = "Model parameter is required." }
)

foreach ($requiredInput in $requiredInputs) {
    if (-not $requiredInput.Value) {
        throw $requiredInput.Message
    }
}

# Python package index passed to pip via --index-url by the install steps below.
$indexUrl = "https://pkgs.dev.azure.com/devdiv/_packaging/MicrosoftSweBench/pypi/simple/"

# Key Vault and secret that hold the GitHub PAT.
$vaultName = "kv-msbench-eval-azuremcp"
$secretName = "azure-eval-gh-pat"

# Read the secret value with Azure CLI and publish it as GITHUB_MCP_SERVER_TOKEN for this process.
try {
    Write-Host "Retrieving GitHub PAT from KeyVault $vaultName secret $secretName"

    # Arguments are splatted so each one reaches az as a separate token.
    $secretShowArgs = @(
        "keyvault", "secret", "show",
        "--vault-name", $vaultName,
        "--name", $secretName,
        "--query", "value",
        "-o", "tsv"
    )
    $pat = az @secretShowArgs

    # Nothing on stdout is treated as "secret not found".
    if (-not $pat) {
        throw "Secret $secretName not found in KeyVault $vaultName."
    }

    $env:GITHUB_MCP_SERVER_TOKEN = $pat
}
catch {
    # Every failure in the try block, including the throw above, is re-raised with one prefix.
    throw "Failed to retrieve GitHub PAT from KeyVault: $_"
}

# Echo the effective inputs so they show up in the pipeline log.
Write-Host "Benchmark: $Benchmark"
Write-Host "Model: $Model"
Write-Host "NoWait: $NoWait"

# Get-Command raises an error when python is not on PATH; with ErrorActionPreference "Stop" that ends the script.
$pythonCommand = Get-Command python
Write-Host "Using python from: $($pythonCommand.Path). Version: $(python --version)"

# NOTE: a non-zero exit code from a native command is not a PowerShell error by default,
# so each install step below checks $LASTEXITCODE explicitly.
Write-Host "Install/upgrade pip"
python -m pip install --upgrade pip
if ($LASTEXITCODE -ne 0) {
    throw "pip upgrade failed with exit code $LASTEXITCODE"
}

Write-Host "Installing artifact authentication dependencies"
python -m pip install keyring artifacts-keyring
if ($LASTEXITCODE -ne 0) {
    throw "Installing keyring/artifacts-keyring failed with exit code $LASTEXITCODE"
}

# Informational only: the version listing is logged but its exit code is deliberately not enforced.
Write-Host "Checking MSBench CLI versions from feed"
python -m pip index versions msbench-cli --index-url $indexUrl

Write-Host "Installing/upgrading MSBench CLI"
python -m pip install --upgrade msbench-cli --index-url $indexUrl
if ($LASTEXITCODE -ne 0) {
    throw "Installing msbench-cli failed with exit code $LASTEXITCODE"
}

Write-Host "MSBench CLI version"
& 'msbench-cli' version

# Assemble the msbench-cli arguments as an array so every value stays a separate argument.
# For --env only the variable name GITHUB_MCP_SERVER_TOKEN is added, never its value.
$runArgs = @("run")
$runArgs += "--agent", "github-copilot-cli"
$runArgs += "--benchmark", $Benchmark
$runArgs += "--model", $Model
$runArgs += "--env", "GITHUB_MCP_SERVER_TOKEN"

# Optional flag controlled by the -NoWait switch.
if ($NoWait) { $runArgs += "--no-wait" }

# Run the benchmark with the arguments assembled in $runArgs.
# The log line is produced from the same array that is splatted to msbench-cli.
Write-Host "Running: msbench-cli $($runArgs -join ' ')"
& 'msbench-cli' @runArgs

# A failing native command does not raise a PowerShell error by default, so the exit code is
# checked immediately after the run; no other native command may run in between.
if ($LASTEXITCODE -ne 0) {
    throw "msbench-cli run failed with exit code $LASTEXITCODE"
}
Loading