Skip to content

Commit

Permalink
[CI/Build] Adding functionality to reset the node's GPUs before processing. (vllm-project#4213)
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexei-V-Ivanov-AMD authored Apr 25, 2024
1 parent fbf152d commit 7ee82be
Showing 1 changed file with 15 additions and 1 deletion.
16 changes: 15 additions & 1 deletion .buildkite/run-amd-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,19 @@ set -ex
# Print ROCm version
# NOTE(review): `set -ex` appears earlier in this script, so a non-zero exit
# from rocminfo presumably stops the job right here -- confirm.
rocminfo


# Reset the node's GPUs before running anything on them.
#
# Writing "reset" to the state file requests the reset; the loop below then
# polls the same file every few seconds until it contains "clean".
# NOTE(review): presumably a node-side service watches this file and performs
# the actual reset -- that service is not visible from this script.
#
# The wait is bounded so that a node which never reports "clean" fails the job
# with a clear message instead of hanging forever. Override the limit (in
# seconds) by setting GPU_RESET_TIMEOUT_SECONDS in the environment.
gpu_state_file=/opt/amdgpu/etc/gpu_state
gpu_reset_poll_seconds=3
gpu_reset_timeout_seconds="${GPU_RESET_TIMEOUT_SECONDS:-600}"

echo "reset" > "${gpu_state_file}"

gpu_reset_waited_seconds=0
while true; do
  sleep "${gpu_reset_poll_seconds}"
  gpu_reset_waited_seconds=$((gpu_reset_waited_seconds + gpu_reset_poll_seconds))
  if grep -q clean "${gpu_state_file}"; then
    echo "GPUs state is \"clean\""
    break
  fi
  # Give up once the budget is spent; the state file never turned "clean".
  if [ "${gpu_reset_waited_seconds}" -ge "${gpu_reset_timeout_seconds}" ]; then
    echo "Timed out after ${gpu_reset_waited_seconds}s waiting for GPUs state to become \"clean\"" >&2
    exit 1
  fi
done



# Build the image tagged `rocm` from Dockerfile.rocm, using the current
# directory as the build context.
docker build --tag rocm --file Dockerfile.rocm .

Expand All @@ -14,7 +27,8 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image
# Start the API server in the background in a container named `rocm`.
# HIP_VISIBLE_DEVICES is exported here and forwarded by name
# (`-e HIP_VISIBLE_DEVICES` with no value copies it from this shell's
# environment), so the container sees the value 1.
# NOTE(review): presumably this limits the server to GPU index 1 -- confirm
# that this is the intended device on the CI node.
# Only one container may hold the name `rocm`, so there is exactly one launch.
export HIP_VISIBLE_DEVICES=1
docker run \
  --device /dev/kfd \
  --device /dev/dri \
  --network host \
  -e HIP_VISIBLE_DEVICES \
  --name rocm \
  rocm python3 -m vllm.entrypoints.api_server &

# Wait for the server to start
wait_for_server_to_start() {
Expand Down

0 comments on commit 7ee82be

Please sign in to comment.