Co-authored-by: amd-pworfolk <[email protected]>
1 parent d263bba · commit ee84439
Showing 16 changed files with 475 additions and 150 deletions.
@@ -0,0 +1,164 @@
name: "Test Lemonade Server"
description: Launch Lemonade Server and test the endpoints
inputs:
  conda_env:
    required: true
    description: "Conda environment arguments spliced into the conda run command"
  load_command:
    required: true
    description: "lemonade command that loads the model under test"
  amd_oga:
    required: false
    default: ""
    description: "Location of the OGA for RyzenAI NPU install directory on disk"
runs:
  using: "composite"
  steps:
    - name: Ensure the Lemonade server works properly
      shell: powershell
      run: |
        $Env:AMD_OGA = "${{ inputs.amd_oga }}"
        $outputFile = "output.log"
        $errorFile = "error.log"

        # Launch the server in the background, capturing stdout/stderr for debugging
        $serverProcess = Start-Process -FilePath "conda" -ArgumentList "run ${{ inputs.conda_env }} lemonade -d .\ci-cache ${{ inputs.load_command }} serve --max-new-tokens 10" -RedirectStandardOutput $outputFile -RedirectStandardError $errorFile -PassThru -NoNewWindow

        Write-Host "Wait for 30 seconds to let the server come up"
        Start-Sleep -Seconds 30

        Write-Host "Check if server process successfully launched"
        $serverRunning = Get-Process -Id $serverProcess.Id -ErrorAction SilentlyContinue
        if (-not $serverRunning) {
          Write-Host "Error: Server process isn't running, even though we just tried to start it!"
          Write-Host "Standard Output:"
          Get-Content $outputFile
          Write-Host "Standard Error:"
          Get-Content $errorFile
          exit 1
        } else {
          Write-Host "Server process is alive."
        }

        Write-Host "Wait for the server port to come up"
        while ($true) {
          $llmPortCheck = Test-NetConnection -ComputerName 127.0.0.1 -Port 8000
          if (-not $llmPortCheck.TcpTestSucceeded) {
            Write-Host "LLM server is not yet running on port 8000!"
            Write-Host "Standard Output:"
            Get-Content $outputFile
            Write-Host "Standard Error:"
            Get-Content $errorFile
          } else {
            Write-Host "LLM server is running on port 8000."
            break
          }
          Start-Sleep -Seconds 30
        }

        Write-Host "Checking the /health endpoint"
        $response = Invoke-WebRequest -Uri http://127.0.0.1:8000/health -UseBasicParsing
        if ($response.StatusCode -eq 200) {
          Write-Output "Good: /health status code is 200"
        } else {
          Write-Output "Error: /health status code is not 200"
          Write-Host "Standard Output:"
          Get-Content $outputFile
          Write-Host "Standard Error:"
          Get-Content $errorFile
          exit 1
        }
        $jsonContent = $response.Content | ConvertFrom-Json
        if ($jsonContent) {
          Write-Output "Good: /health JSON content is not empty: $jsonContent"
        } else {
          Write-Output "Error: /health JSON content is empty"
          Write-Host "Standard Output:"
          Get-Content $outputFile
          Write-Host "Standard Error:"
          Get-Content $errorFile
          exit 1
        }

        Write-Host "Checking the /ws (streaming generation) endpoint"
        # Define the WebSocket URI
        $uri = [System.Uri]::new("ws://127.0.0.1:8000/ws")
        # Create a new ClientWebSocket instance
        $webSocket = [System.Net.WebSockets.ClientWebSocket]::new()
        # Connect to the WebSocket server
        $webSocket.ConnectAsync($uri, [System.Threading.CancellationToken]::None).Wait()
        # Define the message to send
        $message = "Hello, WebSocket!"
        $buffer = [System.Text.Encoding]::UTF8.GetBytes($message)
        $segment = [System.ArraySegment[byte]]::new($buffer)
        # Send the message
        $webSocket.SendAsync($segment, [System.Net.WebSockets.WebSocketMessageType]::Text, $true, [System.Threading.CancellationToken]::None).Wait()
        # Buffer to store the response
        $responseBuffer = New-Object byte[] 1024
        $responseSegment = [System.ArraySegment[byte]]::new($responseBuffer)
        # Variable to store the complete response
        $response = ""
        # Receive the streaming response until the end-of-sequence marker arrives
        do {
          $result = $webSocket.ReceiveAsync($responseSegment, [System.Threading.CancellationToken]::None).Result
          $response += [System.Text.Encoding]::UTF8.GetString($responseBuffer, 0, $result.Count)
        } while ($response -notlike "*</s>*")
        # Close the WebSocket connection
        $webSocket.CloseAsync([System.Net.WebSockets.WebSocketCloseStatus]::NormalClosure, "Closing", [System.Threading.CancellationToken]::None).Wait()
        # Check that the response contains more than just the end marker
        if ($response -and $response -notlike "</s>") {
          Write-Output "Response is not empty: $response"
        } else {
          Write-Output "Response is empty or only contains the end marker: $response"
          Write-Host "Standard Output:"
          Get-Content $outputFile
          Write-Host "Standard Error:"
          Get-Content $errorFile
          exit 1
        }

        Write-Host "Checking the /stats endpoint"
        $response = Invoke-WebRequest -Uri http://127.0.0.1:8000/stats -UseBasicParsing
        if ($response.StatusCode -eq 200) {
          Write-Output "Good: /stats status code is 200"
        } else {
          Write-Output "Error: /stats status code is not 200"
          Write-Host "Standard Output:"
          Get-Content $outputFile
          Write-Host "Standard Error:"
          Get-Content $errorFile
          exit 1
        }
        $jsonContent = $response.Content | ConvertFrom-Json
        if ($jsonContent) {
          Write-Output "Good: /stats JSON content is not empty: $jsonContent"
        } else {
          Write-Output "Error: /stats JSON content is empty"
          Write-Host "Standard Output:"
          Get-Content $outputFile
          Write-Host "Standard Error:"
          Get-Content $errorFile
          exit 1
        }

        Write-Host "Close the server process"
        Stop-Process -Id $serverProcess.Id
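
For context, a minimal sketch of how a workflow could consume this composite action. The `uses:` path and the input values below are assumptions for illustration; the commit does not show the calling workflow.

```yaml
# Sketch of a workflow job consuming the composite action above.
# The uses: path and all input values are illustrative assumptions.
jobs:
  test-server:
    runs-on: windows-latest
    steps:
      - uses: actions/checkout@v4
      - name: Test Lemonade Server on a CPU model
        uses: ./.github/actions/server-testing   # assumed path to this action's action.yml
        with:
          conda_env: "-n lemon"   # spliced verbatim into the conda run command
          load_command: "-i microsoft/Phi-3-mini-4k-instruct oga-load --device cpu --dtype int4"
```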
@@ -0,0 +1,50 @@
# OnnxRuntime GenAI (OGA) for iGPU and CPU

onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running ONNX LLMs: https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file

## Installation

To install:

1. `conda create -n oga-igpu python=3.9`
1. `conda activate oga-igpu`
1. `pip install -e .[llm-oga-igpu]`
   - Note: don't forget the `[llm-oga-igpu]` at the end; this is what installs onnxruntime-genai.
1. Get models:
   - The oga-load tool can download models from Hugging Face and build ONNX files using OGA's model_builder. Models can be quantized and optimized for both igpu and cpu.
   - Download and build ONNX model files:
     - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4`
     - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device cpu --dtype int4`
   - The ONNX model files will be stored in the respective subfolder of the lemonade cache folder and will be reused in future oga-load calls:
     - `oga_models\microsoft_phi-3-mini-4k-instruct\dml-int4`
     - `oga_models\microsoft_phi-3-mini-4k-instruct\cpu-int4`
   - The ONNX model build process can be forced to run again, overwriting the above cache, by using the `--force` flag:
     `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 --force`
   - Transformer model architectures supported by the model_builder tool include many popular state-of-the-art models:
     - Gemma
     - LLaMa
     - Mistral
     - Phi
     - Qwen
     - Nemotron
   - For the full list of supported models, please see the [model_builder documentation](https://github.com/microsoft/onnxruntime-genai/blob/main/src/python/py/models/README.md).
   - The following quantizations are supported for automatically building ONNXRuntime GenAI model files from the Hugging Face repository:
     - cpu: fp32, int4
     - igpu: fp16, int4
1. Directory structure (see the layout sketch after this list):
   - The model_builder tool caches Hugging Face files and temporary ONNX external data files in `<LEMONADE_CACHE>\model_builder`
   - The output from model_builder is stored in `<LEMONADE_CACHE>\oga_models\<MODELNAME>\<SUBFOLDER>`
     - `MODELNAME` is the Hugging Face checkpoint name where any '/' is mapped to an '_' and everything is lower case
     - `SUBFOLDER` is `<EP>-<DTYPE>`, where `EP` is the execution provider (`dml` for igpu, `cpu` for cpu, and `npu` for npu) and `DTYPE` is the datatype
     - If the `--int4-block-size` flag is used then `SUBFOLDER` is `<EP>-<DTYPE>-block-<SIZE>` where `SIZE` is the specified block size
   - Other ONNX models in the format required by onnxruntime-genai can be loaded in lemonade if placed in the `<LEMONADE_CACHE>\oga_models` folder.
     Use the `-i` and `--subfolder` flags to specify the folder and subfolder:
     `lemonade -i my_model_name --subfolder my_subfolder --device igpu --dtype int4 oga-load`
     Lemonade will expect the ONNX model files to be located in `<LEMONADE_CACHE>\oga_models\my_model_name\my_subfolder`
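
Putting the directory rules above together, a sketch of the resulting cache layout, using only folder names taken from the examples in this document:

```
<LEMONADE_CACHE>\
├── model_builder\                        # Hugging Face cache and temporary ONNX external data
└── oga_models\
    ├── microsoft_phi-3-mini-4k-instruct\
    │   ├── dml-int4\                     # igpu build (dml execution provider)
    │   └── cpu-int4\                     # cpu build
    └── my_model_name\
        └── my_subfolder\                 # user-provided ONNX model files
```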
## Usage

Prompt: `lemonade -i meta-llama/Llama-3.2-1B-Instruct oga-load --device igpu --dtype int4 llm-prompt -p "My thoughts are" --max-new-tokens 50`

Serving: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --dtype int4 --device igpu serve --max-new-tokens 100`
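
Once a `serve` command like the one above is running, the endpoints exercised by the CI action in this commit can be spot-checked by hand. A minimal PowerShell sketch, assuming the default host and port (127.0.0.1:8000) used by the test script:

```powershell
# Query the /health and /stats endpoints of a locally running Lemonade Server.
# Assumes the server is listening on 127.0.0.1:8000, as in the test action above.
$health = Invoke-WebRequest -Uri http://127.0.0.1:8000/health -UseBasicParsing
Write-Host "health: $($health.StatusCode) $($health.Content)"

$stats = Invoke-WebRequest -Uri http://127.0.0.1:8000/stats -UseBasicParsing
Write-Host "stats: $($stats.StatusCode) $($stats.Content)"
```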