Skip to content

Commit

Permalink
replace GPUtil with pynvml
Browse files Browse the repository at this point in the history
  • Loading branch information
efajardo-nv committed Jan 3, 2024
1 parent 36f1c19 commit c091731
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 31 deletions.
2 changes: 1 addition & 1 deletion ci/conda/recipes/morpheus/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ outputs:
- {{ pin_compatible('cudatoolkit', min_pin='x.x', max_pin='x') }}
test:
requires:
- gputil
- pynvml
- pytest
- pytest-cov
- pytest-benchmark
Expand Down
1 change: 0 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ dependencies:
- flake8
- gcc_linux-64=11.2
- git-lfs
- gputil
- grpcio
- gxx_linux-64=11.2
- huggingface_hub=0.10.1
Expand Down
1 change: 0 additions & 1 deletion conda/environments/dev_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ dependencies:
- flake8
- gcc_linux-64=11.2
- git-lfs
- gputil
- grpcio
- gxx_linux-64=11.2
- include-what-you-use=0.20
Expand Down
1 change: 0 additions & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,6 @@ dependencies:
- dill
- elasticsearch==8.9.0
- feedparser=6.0.10
- gputil
- grpcio
- mlflow>=2.2.1,<3
- nb_conda_kernels
Expand Down
1 change: 0 additions & 1 deletion docker/conda/environments/cuda11.8_dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ dependencies:
- git>=2.35.3 # Needed for wildcards on safe.directory
- glog=0.6
- gmock>=1.13.0
- gputil
- grpcio
- gtest>=1.13.0
- gxx_linux-64=11.2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
import json
from os import path

import GPUtil
from pynvml.smi import NVSMI_QUERY_GPU
from pynvml.smi import nvidia_smi

from benchmarks.test_bench_e2e_dfp_pipeline import PIPELINES_CONF

Expand All @@ -32,18 +33,40 @@ def pytest_benchmark_update_json(config, benchmarks, output_json): # pylint:dis

curr_dir = path.dirname(path.abspath(__file__))

gpus = GPUtil.getGPUs()

for i, gpu in enumerate(gpus):
# output_json["machine_info"]["gpu_" + str(i)] = gpu.name
output_json["machine_info"]["gpu_" + str(i)] = {}
output_json["machine_info"]["gpu_" + str(i)]["id"] = gpu.id
output_json["machine_info"]["gpu_" + str(i)]["name"] = gpu.name
output_json["machine_info"]["gpu_" + str(i)]["load"] = f"{gpu.load*100}%"
output_json["machine_info"]["gpu_" + str(i)]["free_memory"] = f"{gpu.memoryFree}MB"
output_json["machine_info"]["gpu_" + str(i)]["used_memory"] = f"{gpu.memoryUsed}MB"
output_json["machine_info"]["gpu_" + str(i)]["temperature"] = f"{gpu.temperature} C"
output_json["machine_info"]["gpu_" + str(i)]["uuid"] = gpu.uuid
query_opts = NVSMI_QUERY_GPU.copy()
nvsmi = nvidia_smi.getInstance()
device_query = nvsmi.DeviceQuery([
query_opts["driver_version"],
query_opts["count"],
query_opts["index"],
query_opts["gpu_name"],
query_opts["gpu_uuid"],
query_opts["memory.total"],
query_opts["memory.used"],
query_opts["memory.free"],
query_opts["utilization.gpu"],
query_opts["utilization.memory"],
query_opts["temperature.gpu"]
])

output_json["machine_info"]["gpu_driver_version"] = device_query["driver_version"]

for gpu in device_query["gpu"]:
gpu_num = gpu["minor_number"]
output_json["machine_info"]["gpu_" + gpu_num] = {}
output_json["machine_info"]["gpu_" + gpu_num]["id"] = gpu_num
output_json["machine_info"]["gpu_" + gpu_num]["name"] = gpu["product_name"]
output_json["machine_info"][
"gpu_" + gpu_num]["utilization"] = f"{gpu['utilization']['gpu_util']}{gpu['utilization']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["total_memory"] = f"{gpu['fb_memory_usage']['total']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["used_memory"] = f"{gpu['fb_memory_usage']['used']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["free_memory"] = f"{gpu['fb_memory_usage']['free']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["temperature"] = f"{gpu['temperature']['gpu_temp']} {gpu['temperature']['unit']}"
output_json["machine_info"]["gpu_" + gpu_num]["uuid"] = gpu["uuid"]

for bench in output_json['benchmarks']:

Expand Down
50 changes: 37 additions & 13 deletions tests/benchmarks/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,49 @@
import typing
from unittest import mock

import GPUtil
from pynvml.smi import NVSMI_QUERY_GPU
from pynvml.smi import nvidia_smi
import pytest
from test_bench_e2e_pipelines import E2E_TEST_CONFIGS


# pylint: disable=unused-argument
def pytest_benchmark_update_json(config, benchmarks, output_json):
gpus = GPUtil.getGPUs()

for i, gpu in enumerate(gpus):
# output_json["machine_info"]["gpu_" + str(i)] = gpu.name
output_json["machine_info"]["gpu_" + str(i)] = {}
output_json["machine_info"]["gpu_" + str(i)]["id"] = gpu.id
output_json["machine_info"]["gpu_" + str(i)]["name"] = gpu.name
output_json["machine_info"]["gpu_" + str(i)]["load"] = f"{gpu.load*100}%"
output_json["machine_info"]["gpu_" + str(i)]["free_memory"] = f"{gpu.memoryFree}MB"
output_json["machine_info"]["gpu_" + str(i)]["used_memory"] = f"{gpu.memoryUsed}MB"
output_json["machine_info"]["gpu_" + str(i)]["temperature"] = f"{gpu.temperature} C"
output_json["machine_info"]["gpu_" + str(i)]["uuid"] = gpu.uuid

query_opts = NVSMI_QUERY_GPU.copy()
nvsmi = nvidia_smi.getInstance()
device_query = nvsmi.DeviceQuery([
query_opts["driver_version"],
query_opts["count"],
query_opts["index"],
query_opts["gpu_name"],
query_opts["gpu_uuid"],
query_opts["memory.total"],
query_opts["memory.used"],
query_opts["memory.free"],
query_opts["utilization.gpu"],
query_opts["utilization.memory"],
query_opts["temperature.gpu"]
])

output_json["machine_info"]["gpu_driver_version"] = device_query["driver_version"]

for gpu in device_query["gpu"]:
gpu_num = gpu["minor_number"]
output_json["machine_info"]["gpu_" + gpu_num] = {}
output_json["machine_info"]["gpu_" + gpu_num]["id"] = gpu_num
output_json["machine_info"]["gpu_" + gpu_num]["name"] = gpu["product_name"]
output_json["machine_info"][
"gpu_" + gpu_num]["utilization"] = f"{gpu['utilization']['gpu_util']}{gpu['utilization']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["total_memory"] = f"{gpu['fb_memory_usage']['total']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["used_memory"] = f"{gpu['fb_memory_usage']['used']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["free_memory"] = f"{gpu['fb_memory_usage']['free']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["temperature"] = f"{gpu['temperature']['gpu_temp']} {gpu['temperature']['unit']}"
output_json["machine_info"]["gpu_" + gpu_num]["uuid"] = gpu["uuid"]

for bench in output_json['benchmarks']:
if bench["name"] not in E2E_TEST_CONFIGS:
Expand Down

0 comments on commit c091731

Please sign in to comment.