Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@

venv/
.venv/
__integration_*__/
__pycache__/
__test_fixtures__/
Expand Down
14 changes: 11 additions & 3 deletions pipelines/azure-benchmarks.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
trigger: none
trigger:
branches:
include:
- main

pr: none

stages:
- stage: RunBenchmark
displayName: 'Run Benchmarks'
pool:
name: $(LINUXPOOL)
image: $(LINUXVMIMAGE)
Expand All @@ -14,6 +19,11 @@ stages:
- job: RunBenchmark
displayName: 'Run Copilot Benchmarks'
steps:
- task: UsePythonVersion@0
displayName: 'Use Python 3.14'
inputs:
versionSpec: '3.14'

- task: AzureCLI@2
displayName: 'Run Copilot Benchmarks script'
inputs:
Expand All @@ -22,5 +32,3 @@ stages:
scriptType: 'pscore'
scriptLocation: 'scriptPath'
scriptPath: $(Build.SourcesDirectory)/pipelines/scripts/Invoke-CopilotBenchmarks.ps1
arguments: >
-BuildId $(OfficialBuildId)
139 changes: 131 additions & 8 deletions pipelines/scripts/Invoke-CopilotBenchmarks.ps1
Original file line number Diff line number Diff line change
@@ -1,12 +1,135 @@
param(
[string]$BuildId
)
<#
.SYNOPSIS
Installs MSBench CLI and runs a Copilot Azure benchmark.

.DESCRIPTION
This script runs in Azure DevOps under an AzureCLI@2 task with federated authentication.
It first reads a GitHub PAT from Azure Key Vault and exports it as GITHUB_MCP_SERVER_TOKEN.
It then acquires an Azure DevOps AAD token from the already-authenticated az CLI session,
constructs an authenticated pip index URL, installs MSBench CLI from the
MicrosoftSweBench Azure Artifacts feed, and invokes:
msbench-cli run --agent github-copilot-cli --benchmark <benchmark> --model <model> --env GITHUB_MCP_SERVER_TOKEN [--no-wait]

MSBench CLI reference:
- https://github.com/devdiv-microsoft/MicrosoftSweBench/wiki

# Install MSBench CLI
Write-Host "Installing keyring"
pip install keyring artifacts-keyring
.PARAMETER Benchmark
Benchmark identifier. Default: azure

Write-Host "Listing key vaults in the resource group"
az keyvault list --resource-group rg-msbench-eval-kv-azure-mcp --query "[].name" -o tsv
.PARAMETER Model
Model identifier. Default: claude-sonnet-4.5-autodev-test

.PARAMETER NoWait
Whether to add --no-wait to the run command.

.LINK
https://github.com/devdiv-microsoft/MicrosoftSweBench/wiki
#>

param(
    # Benchmark identifier forwarded to `msbench-cli run --benchmark`.
    [string]$Benchmark = "azure",
    # Model identifier forwarded to `msbench-cli run --model`.
    [string]$Model = "claude-sonnet-4.5-autodev-test",
    # When set, `--no-wait` is appended to the run command.
    [switch]$NoWait
)

Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"

# The defaults make these non-empty, but a caller can still pass "" explicitly.
if (!$Benchmark) {
    throw "Benchmark parameter is required."
}

if (!$Model) {
    throw "Model parameter is required."
}

$vaultName = "kv-msbench-eval-azuremcp"
$secretName = "azure-eval-gh-pat"

Write-Host "Benchmark: $Benchmark"
Write-Host "Model: $Model"
Write-Host "NoWait: $NoWait"

# TF_BUILD is compared against "True" to decide whether to emit ##vso logging commands.
$pipelineRun = $env:TF_BUILD -eq "True"

# --- Retrieve GitHub PAT from KeyVault ---
try {
    Write-Host "Retrieving GitHub PAT from KeyVault $vaultName secret $secretName"
    $pat = az keyvault secret show --vault-name $vaultName --name $secretName --query value -o tsv

    # A failing native command does not throw by itself; an empty result is the failure signal here.
    if (!$pat) {
        throw "Secret $secretName not found in KeyVault $vaultName."
    }

    # Exported so that `msbench-cli run --env GITHUB_MCP_SERVER_TOKEN` can reference it by name.
    $env:GITHUB_MCP_SERVER_TOKEN = $pat

    # Register the PAT as a secret so the pipeline masks it in logs
    if ($pipelineRun) {
        Write-Host "##vso[task.setsecret]$pat"
    }
}
catch {
    throw "Failed to retrieve GitHub PAT from KeyVault: $_"
}

# --- Authenticate to Azure DevOps Artifacts feed via AAD token ---
Write-Host "Acquiring Azure DevOps AAD token for feed authentication"
$adoResourceId = "499b84ac-1321-427f-aa17-267ca6975798"
$adoAccessToken = az account get-access-token --resource $adoResourceId --query accessToken -o tsv

if (!$adoAccessToken) {
    throw "Failed to acquire Azure DevOps AAD access token. Ensure the AzureCLI@2 task has a valid service connection."
}

# The token is embedded in the URL's userinfo part, so it must be percent-encoded.
$encodedToken = [System.Uri]::EscapeDataString($adoAccessToken)
$indexUrl = "https://vsts:$encodedToken@pkgs.dev.azure.com/devdiv/_packaging/MicrosoftSweBench/pypi/simple/"
Write-Host "Authenticated pip index URL constructed."

# Register both the raw and the encoded token as secrets so the pipeline masks them in logs
if ($pipelineRun) {
    Write-Host "##vso[task.setsecret]$adoAccessToken"
    Write-Host "##vso[task.setsecret]$encodedToken"
}

# With $ErrorActionPreference = "Stop", Get-Command fails the script if python is not on PATH.
$pythonCommand = Get-Command python
Write-Host "Using python from: $($pythonCommand.Path). Version: $(python --version 2>&1)"

# Native commands report failure only through $LASTEXITCODE, so each step is checked explicitly.
Write-Host "Install/upgrade pip"
python -m pip install --upgrade pip
if ($LASTEXITCODE -ne 0) {
    throw "pip install/upgrade failed with exit code $LASTEXITCODE"
}

Write-Host "Checking MSBench CLI versions from feed"
python -m pip index versions msbench-cli --no-input --index-url $indexUrl
if ($LASTEXITCODE -ne 0) {
    throw "pip index versions failed with exit code $LASTEXITCODE"
}

Write-Host "Installing/upgrading MSBench CLI"
python -m pip install --upgrade msbench-cli --no-input --index-url $indexUrl
if ($LASTEXITCODE -ne 0) {
    throw "pip install msbench-cli failed with exit code $LASTEXITCODE"
}

Write-Host "MSBench CLI version"
& 'msbench-cli' version
# Fail fast if the freshly installed CLI cannot even report its version.
if ($LASTEXITCODE -ne 0) {
    throw "msbench-cli version failed with exit code $LASTEXITCODE"
}

# Arguments are kept as an array and splatted so each value stays a single argument.
$runArgs = @(
    "run",
    "--agent", "github-copilot-cli",
    "--benchmark", $Benchmark,
    "--model", $Model,
    "--env", "GITHUB_MCP_SERVER_TOKEN"
)

if ($NoWait) {
    $runArgs += "--no-wait"
}

Write-Host "Running: msbench-cli $($runArgs -join ' ')"
& 'msbench-cli' @runArgs

if ($LASTEXITCODE -ne 0) {
    throw "msbench-cli run failed with exit code $LASTEXITCODE"
}
Loading