Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@

venv/
.venv/
__integration_*__/
__pycache__/
__test_fixtures__/
Expand Down
19 changes: 16 additions & 3 deletions pipelines/azure-benchmarks.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
trigger: none
trigger:
branches:
include:
- main

pr: none

stages:
- stage: RunBenchmark
displayName: 'Run Benchmarks'
pool:
name: $(LINUXPOOL)
image: $(LINUXVMIMAGE)
Expand All @@ -14,6 +19,16 @@ stages:
- job: RunBenchmark
displayName: 'Run Copilot Benchmarks'
steps:
- task: UsePythonVersion@0
displayName: 'Use Python 3.12'
inputs:
versionSpec: '3.12'

- task: PipAuthenticate@1
displayName: 'Authenticate pip with MicrosoftSweBench feed'
inputs:
artifactFeeds: 'internal/MicrosoftSweBench'

- task: AzureCLI@2
displayName: 'Run Copilot Benchmarks script'
inputs:
Expand All @@ -22,5 +37,3 @@ stages:
scriptType: 'pscore'
scriptLocation: 'scriptPath'
scriptPath: $(Build.SourcesDirectory)/pipelines/scripts/Invoke-CopilotBenchmarks.ps1
arguments: >
-BuildId $(OfficialBuildId)
124 changes: 116 additions & 8 deletions pipelines/scripts/Invoke-CopilotBenchmarks.ps1
Original file line number Diff line number Diff line change
@@ -1,12 +1,120 @@
param(
[string]$BuildId
)
<#
.SYNOPSIS
Installs MSBench CLI and runs a Copilot Azure benchmark.

.DESCRIPTION
This script runs in Azure DevOps under an AzureCLI@2 task with federated authentication.
Feed authentication is handled by a preceding PipAuthenticate@1 task that sets
PIP_EXTRA_INDEX_URL for the azure-sdk/internal/MicrosoftSweBench feed.
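The script retrieves a GitHub PAT from KeyVault, exports it as the GITHUB_MCP_SERVER_TOKEN
environment variable, installs MSBench CLI, and invokes:
msbench-cli run --agent github-copilot-cli --benchmark <benchmark> --model <model> --env GITHUB_MCP_SERVER_TOKEN [--no-wait]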

MSBench CLI reference:
- https://github.com/devdiv-microsoft/MicrosoftSweBench/wiki

# Install MSBench CLI
Write-Host "Installing keyring"
pip install keyring artifacts-keyring
.PARAMETER Benchmark
Benchmark identifier. Default: azure

Write-Host "Listing key vaults in the resource group"
az keyvault list --resource-group rg-msbench-eval-kv-azure-mcp --query "[].name" -o tsv
.PARAMETER Model
Model identifier. Default: claude-sonnet-4.5-autodev-test

.PARAMETER NoWait
Switch. When specified, --no-wait is appended to the msbench-cli run command.

.LINK
https://github.com/devdiv-microsoft/MicrosoftSweBench/wiki
#>

param(
    [string]$Benchmark = "azure",
    [string]$Model = "claude-sonnet-4.5-autodev-test",
    [switch]$NoWait
)

# Fail fast: stop on cmdlet errors and enable the strictest variable/property checks.
$ErrorActionPreference = "Stop"
Set-StrictMode -Version Latest

# Both parameters have defaults, but a caller can still pass an explicit empty
# string, so reject empty values before doing any work.
if ([string]::IsNullOrEmpty($Benchmark)) { throw "Benchmark parameter is required." }
if ([string]::IsNullOrEmpty($Model)) { throw "Model parameter is required." }

# KeyVault and secret name from which the GitHub PAT is read later in the script.
$vaultName = "kv-msbench-eval-azuremcp"
$secretName = "azure-eval-gh-pat"

# Echo the effective settings for this run.
foreach ($settingLine in "Benchmark: $Benchmark", "Model: $Model", "NoWait: $NoWait") {
    Write-Host $settingLine
}

# True only when the TF_BUILD environment variable equals "True"
# (presumably set by the Azure Pipelines agent; unset for local runs).
$pipelineRun = $env:TF_BUILD -eq "True"

# --- Retrieve GitHub PAT from KeyVault ---
# Reads the PAT with the Azure CLI, registers it as a pipeline secret so the log
# output masks it, and then exports it as GITHUB_MCP_SERVER_TOKEN.
# Any failure is re-thrown with a "Failed to retrieve GitHub PAT" prefix.
try {
    Write-Host "Retrieving GitHub PAT from KeyVault $vaultName secret $secretName"
    $pat = az keyvault secret show --vault-name $vaultName --name $secretName --query value -o tsv

    # A failing native command does not raise a terminating error by itself, so
    # check the exit code explicitly. Without this, an auth or network failure
    # would be misreported below as "secret not found".
    if ($LASTEXITCODE -ne 0) {
        throw "az keyvault secret show failed with exit code $LASTEXITCODE"
    }

    if (!$pat) {
        throw "Secret $secretName not found in KeyVault $vaultName."
    }

    # Register the value as a secret BEFORE storing it anywhere else, so that any
    # later log line containing it is masked by the pipeline agent.
    if ($pipelineRun) {
        Write-Host "##vso[task.setsecret]$pat"
    }

    $env:GITHUB_MCP_SERVER_TOKEN = $pat
}
catch {
    throw "Failed to retrieve GitHub PAT from KeyVault: $_"
}

# --- Feed auth is handled by the PipAuthenticate@1 pipeline task ---
# PipAuthenticate sets PIP_EXTRA_INDEX_URL for the azure-sdk/internal/MicrosoftSweBench feed.
# This script only reports whether the variable is present; it never sets it.
if (-not $env:PIP_EXTRA_INDEX_URL) {
    Write-Warning "PIP_EXTRA_INDEX_URL is not set. Feed authentication may fail. Ensure PipAuthenticate@1 runs before this script."
}
else {
    Write-Host "PIP_EXTRA_INDEX_URL is set (feed auth configured by PipAuthenticate task)"
}

# Resolve the interpreter once and log where it lives and which version it reports.
$pythonInfo = Get-Command python
$pythonVersion = "$(python --version 2>&1)"
Write-Host "Using python from: $($pythonInfo.Path). Version: $pythonVersion"

# Each native step below is followed by an explicit exit-code check.
Write-Host "Install/upgrade pip"
python -m pip install --upgrade pip
if ($LASTEXITCODE -ne 0) { throw "pip install/upgrade failed with exit code $LASTEXITCODE" }

Write-Host "Installing/upgrading MSBench CLI"
python -m pip install msbench-cli --no-input
if ($LASTEXITCODE -ne 0) { throw "pip install msbench-cli failed with exit code $LASTEXITCODE" }

# Smoke-test the installed CLI by asking for its version.
Write-Host "MSBench CLI version"
msbench-cli version
if ($LASTEXITCODE -ne 0) { throw "msbench-cli version failed with exit code $LASTEXITCODE" }

# Assemble the msbench-cli argument list. Only the NAME of the token variable is
# passed via --env, so the command line logged below never contains the PAT value.
# --no-wait is appended only when the -NoWait switch was given.
$cliArgs = @(
    "run"
    "--agent", "github-copilot-cli"
    "--benchmark", $Benchmark
    "--model", $Model
    "--env", "GITHUB_MCP_SERVER_TOKEN"
) + @(if ($NoWait) { "--no-wait" })

$renderedArgs = $cliArgs -join ' '
Write-Host "Running: msbench-cli $renderedArgs"
msbench-cli @cliArgs

# Surface a non-zero exit code from the benchmark run as a terminating error.
if ($LASTEXITCODE -ne 0) { throw "msbench-cli run failed with exit code $LASTEXITCODE" }
Loading