Skip to content

Commit 8933406

Browse files
authored
deps(nvidia-ml-py): add nvidia-ml-py 13.580.65 to support list (#178)
1 parent c166dfc commit 8933406

File tree

5 files changed

+138
-47
lines changed

5 files changed

+138
-47
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1313

1414
### Added
1515

16-
-
16+
- Add CUDA-13 NVML API support by [@XuehaiPan](https://github.com/XuehaiPan) in [#178](https://github.com/XuehaiPan/nvitop/pull/178).
1717

1818
### Changed
1919

nvitop/api/libnvml.py

Lines changed: 133 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# ==============================================================================
1717
"""Utilities for the NVML Python bindings (`nvidia-ml-py <https://pypi.org/project/nvidia-ml-py>`_)."""
1818

19-
# pylint: disable=invalid-name
19+
# pylint: disable=too-many-lines,invalid-name
2020

2121
from __future__ import annotations
2222

@@ -265,9 +265,13 @@ def _lazy_init() -> None:
265265
If cannot find function :func:`pynvml.nvmlInitWithFlags`, usually the :mod:`pynvml` module
266266
is overridden by other modules. Need to reinstall package ``nvidia-ml-py``.
267267
"""
268+
if __initialized:
269+
return
270+
268271
with __lock:
269272
if __initialized:
270-
return
273+
return # type: ignore[unreachable]
274+
271275
nvmlInit()
272276
_atexit.register(nvmlShutdown)
273277

@@ -531,12 +535,24 @@ def nvmlCheckReturn(retval: _Any, types: type | tuple[type, ...] | None = None,
531535
# Patch layers for backward compatibility ##########################################################
532536
# The installation is considered intact only when BOTH private helpers that the patch layers
# import from `pynvml` are present.  The negation has to wrap the whole conjunction: `not` binds
# tighter than `and`, so `not callable(...) and isinstance(...)` would classify a module that
# lacks `_PrintableStructure` (or lacks both helpers) as healthy, and the subsequent
# `from pynvml import _nvmlGetFunctionPointer, _PrintableStructure, ...` would then raise
# ImportError instead of taking the "corrupted installation" fallback path.
_pynvml_installation_corrupted: bool = not (
    callable(getattr(_pynvml, '_nvmlGetFunctionPointer', None))
    and isinstance(getattr(_pynvml, '_PrintableStructure', None), type)
)
535539

536540
# Patch function `nvmlDeviceGet{Compute,Graphics,MPSCompute}RunningProcesses`
537541
if not _pynvml_installation_corrupted:
542+
# pylint: disable-next=ungrouped-imports
543+
from pynvml import _nvmlGetFunctionPointer, _PrintableStructure, nvmlStructToFriendlyObject
544+
545+
def _nvmlLookupFunctionPointer(symbol: str) -> _Any | None:
    """Resolve ``symbol`` via ``_nvmlGetFunctionPointer``, or return ``None`` if it is not found.

    Only ``NVMLError_FunctionNotFound`` is handled here; any other exception raised by the
    lookup propagates to the caller.  The outcome is logged at debug level in both cases.
    """
    try:
        resolved = _nvmlGetFunctionPointer(symbol)
    except NVMLError_FunctionNotFound:
        resolved = None
        LOGGER.debug('Failed to found symbol `%s`.', symbol)
    else:
        LOGGER.debug('Found symbol `%s`.', symbol)
    return resolved
553+
538554
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
539-
class c_nvmlProcessInfo_v1_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
555+
class c_nvmlProcessInfo_v1_t(_PrintableStructure):
540556
_fields_: _ClassVar[list[tuple[str, type]]] = [
541557
# Process ID
542558
('pid', _ctypes.c_uint),
@@ -550,7 +566,7 @@ class c_nvmlProcessInfo_v1_t(_pynvml._PrintableStructure): # pylint: disable=pr
550566
}
551567

552568
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
553-
class c_nvmlProcessInfo_v2_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
569+
class c_nvmlProcessInfo_v2_t(_PrintableStructure):
554570
_fields_: _ClassVar[list[tuple[str, type]]] = [
555571
# Process ID
556572
('pid', _ctypes.c_uint),
@@ -570,7 +586,7 @@ class c_nvmlProcessInfo_v2_t(_pynvml._PrintableStructure): # pylint: disable=pr
570586
}
571587

572588
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
573-
class c_nvmlProcessInfo_v3_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
589+
class c_nvmlProcessInfo_v3_t(_PrintableStructure):
574590
_fields_: _ClassVar[list[tuple[str, type]]] = [
575591
# Process ID
576592
('pid', _ctypes.c_uint),
@@ -599,22 +615,11 @@ def __determine_get_running_processes_version_suffix() -> str:
599615
global __get_running_processes_version_suffix, c_nvmlProcessInfo_t # pylint: disable=global-statement
600616

601617
if __get_running_processes_version_suffix is None:
602-
# pylint: disable-next=protected-access,no-member
603-
nvmlGetFunctionPointer = _pynvml._nvmlGetFunctionPointer
604618
__get_running_processes_version_suffix = '_v3'
605-
606-
def lookup(symbol: str) -> _Any | None:
607-
try:
608-
ptr = nvmlGetFunctionPointer(symbol)
609-
except NVMLError_FunctionNotFound:
610-
LOGGER.debug('Failed to found symbol `%s`.', symbol)
611-
return None
612-
LOGGER.debug('Found symbol `%s`.', symbol)
613-
return ptr
614-
615-
if lookup('nvmlDeviceGetComputeRunningProcesses_v3'):
616-
if lookup('nvmlDeviceGetConfComputeMemSizeInfo') and not lookup(
617-
'nvmlDeviceGetRunningProcessDetailList',
619+
if _nvmlLookupFunctionPointer('nvmlDeviceGetComputeRunningProcesses_v3') is not None:
620+
if (
621+
_nvmlLookupFunctionPointer('nvmlDeviceGetConfComputeMemSizeInfo') is not None
622+
and _nvmlLookupFunctionPointer('nvmlDeviceGetRunningProcessDetailList') is None
618623
):
619624
LOGGER.debug(
620625
'NVML get running process version 3 API with v3 type struct is available.',
@@ -634,7 +639,10 @@ def lookup(symbol: str) -> _Any | None:
634639
'due to incompatible NVIDIA driver. Fallback to use get running process '
635640
'version 2 API with v2 type struct.',
636641
)
637-
if lookup('nvmlDeviceGetComputeRunningProcesses_v2'):
642+
if (
643+
_nvmlLookupFunctionPointer('nvmlDeviceGetComputeRunningProcesses_v2')
644+
is not None
645+
):
638646
LOGGER.debug(
639647
'NVML get running process version 2 API with v2 type struct is available.',
640648
)
@@ -663,8 +671,7 @@ def __nvml_device_get_running_processes(
663671

664672
# First call to get the size
665673
c_count = _ctypes.c_uint(0)
666-
# pylint: disable-next=protected-access
667-
fn = _pynvml._nvmlGetFunctionPointer(f'{func}{version_suffix}')
674+
fn = _nvmlGetFunctionPointer(f'{func}{version_suffix}')
668675
ret = fn(handle, _ctypes.byref(c_count), None)
669676

670677
if ret == NVML_SUCCESS:
@@ -679,12 +686,13 @@ def __nvml_device_get_running_processes(
679686

680687
# Make the call again
681688
ret = fn(handle, _ctypes.byref(c_count), c_processes)
682-
_pynvml._nvmlCheckReturn(ret) # pylint: disable=protected-access
689+
if ret != NVML_SUCCESS:
690+
raise NVMLError(ret)
683691

684692
processes = []
685693
for i in range(c_count.value):
686694
# Use an alternative struct for this object
687-
obj = _pynvml.nvmlStructToFriendlyObject(c_processes[i])
695+
obj = nvmlStructToFriendlyObject(c_processes[i])
688696
if obj.usedGpuMemory == ULONGLONG_MAX:
689697
# Special case for WDDM on Windows, see comment above
690698
obj.usedGpuMemory = None
@@ -781,7 +789,7 @@ def nvmlDeviceGetMPSComputeRunningProcesses( # pylint: disable=function-redefin
781789
# Patch function `nvmlDeviceGetMemoryInfo`
782790
if not _pynvml_installation_corrupted:
783791
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
784-
class c_nvmlMemory_v1_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
792+
class c_nvmlMemory_v1_t(_PrintableStructure):
785793
_fields_: _ClassVar[list[tuple[str, type]]] = [
786794
# Total physical device memory (in bytes).
787795
('total', _ctypes.c_ulonglong),
@@ -794,7 +802,7 @@ class c_nvmlMemory_v1_t(_pynvml._PrintableStructure): # pylint: disable=protect
794802
_fmt_: _ClassVar[dict[str, str]] = {'<default>': '%d B'}
795803

796804
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
797-
class c_nvmlMemory_v2_t(_pynvml._PrintableStructure): # pylint: disable=protected-access
805+
class c_nvmlMemory_v2_t(_PrintableStructure):
798806
_fields_: _ClassVar[list[tuple[str, type]]] = [
799807
# Structure format version (must be 2).
800808
('version', _ctypes.c_uint),
@@ -810,30 +818,24 @@ class c_nvmlMemory_v2_t(_pynvml._PrintableStructure): # pylint: disable=protect
810818
]
811819
_fmt_: _ClassVar[dict[str, str]] = {'<default>': '%d B'}
812820

813-
# Value stored in the `version` field of the v2 memory struct.  The constant exported by the
# installed `pynvml` is used when it exists; otherwise it is derived as the struct size OR-ed
# with the version number (2) shifted left by 24 bits.
nvmlMemory_v2 = getattr(
    _pynvml,
    'nvmlMemory_v2',
    (2 << 24) | _ctypes.sizeof(c_nvmlMemory_v2_t),
)
# Cached symbol suffix of the memory-info API; `None` until it has been determined.
__get_memory_info_version_suffix: str | None = None
# Struct type currently selected for memory queries; starts out as the v2 layout.
c_nvmlMemory_t = c_nvmlMemory_v2_t
816824

817825
def __determine_get_memory_info_version_suffix() -> str:
    """Return the symbol suffix of the memory-info API to call, probing the library only once.

    The first call looks up ``nvmlDeviceGetMemoryInfo_v2``.  When the symbol is missing, the
    module-level ``c_nvmlMemory_t`` is rebound to ``c_nvmlMemory_v1_t`` and the suffix becomes
    ``''``; otherwise the suffix stays ``'_v2'``.  Subsequent calls return the cached value.
    """
    global __get_memory_info_version_suffix, c_nvmlMemory_t  # pylint: disable=global-statement

    if __get_memory_info_version_suffix is not None:
        return __get_memory_info_version_suffix

    # Record the optimistic answer before probing, then downgrade if the symbol is absent.
    __get_memory_info_version_suffix = '_v2'
    if _nvmlLookupFunctionPointer('nvmlDeviceGetMemoryInfo_v2') is None:
        c_nvmlMemory_t = c_nvmlMemory_v1_t
        __get_memory_info_version_suffix = ''
        LOGGER.debug(
            'NVML get memory info version 2 API is not available due to incompatible '
            'NVIDIA driver. Fallback to use NVML get memory info version 1 API.',
        )
    else:
        LOGGER.debug('NVML get memory info version 2 is available.')

    return __get_memory_info_version_suffix
839841

@@ -865,19 +867,19 @@ def nvmlDeviceGetMemoryInfo( # pylint: disable=function-redefined
865867
if version_suffix == '_v2':
866868
c_memory = c_nvmlMemory_v2_t()
867869
c_memory.version = nvmlMemory_v2 # pylint: disable=attribute-defined-outside-init
868-
# pylint: disable-next=protected-access
869-
fn = _pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo_v2')
870870
elif version_suffix in {'_v1', ''}:
871871
c_memory = c_nvmlMemory_v1_t()
872-
# pylint: disable-next=protected-access
873-
fn = _pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo')
872+
version_suffix = ''
874873
else:
875874
raise ValueError(
876875
f'Unknown version suffix {version_suffix!r} for '
877876
'function `nvmlDeviceGetMemoryInfo`.',
878877
)
878+
879+
fn = _nvmlGetFunctionPointer(f'nvmlDeviceGetMemoryInfo{version_suffix}')
879880
ret = fn(handle, _ctypes.byref(c_memory))
880-
_pynvml._nvmlCheckReturn(ret) # pylint: disable=protected-access
881+
if ret != NVML_SUCCESS:
882+
raise NVMLError(ret)
881883
return c_memory
882884

883885
else:
@@ -888,6 +890,94 @@ def nvmlDeviceGetMemoryInfo( # pylint: disable=function-redefined
888890
'`nvidia-ml-py` via `pip3 install --force-reinstall nvidia-ml-py nvitop`.',
889891
)
890892

893+
# Patch function `nvmlDeviceGetTemperature`
894+
if not _pynvml_installation_corrupted:
895+
# pylint: disable-next=missing-class-docstring,too-few-public-methods,function-redefined
896+
class c_nvmlTemperature_v1_t(_PrintableStructure):
    # Argument struct passed by reference to `nvmlDeviceGetTemperatureV`.
    # Field order defines the memory layout and must not be changed.
    _fields_: _ClassVar[list[tuple[str, type]]] = [
        ('version', _ctypes.c_uint),  # structure format version (must be 1)
        ('sensorType', _ctypes.c_uint),  # sensor type to read
        ('temperature', _ctypes.c_int),  # temperature in degrees Celsius
    ]
905+
906+
# Value stored in the `version` field of the temperature struct.  The constant exported by the
# installed `pynvml` is used when it exists; otherwise it is derived as the struct size OR-ed
# with the version number (1) shifted left by 24 bits.
nvmlTemperature_v1: int = getattr(
    _pynvml,
    'nvmlTemperature_v1',
    (1 << 24) | _ctypes.sizeof(c_nvmlTemperature_v1_t),
)
# Cached symbol suffix of the temperature API; `None` until it has been determined.
__get_temperature_version_suffix: str | None = None
912+
913+
def __determine_get_temperature_version_suffix() -> str:
    """Return the symbol suffix of the temperature API to call, probing the library only once.

    The suffix is ``'V'`` when ``nvmlDeviceGetTemperatureV`` can be resolved and ``''``
    otherwise.  The result is cached at module level and reused by later calls.
    """
    global __get_temperature_version_suffix  # pylint: disable=global-statement

    if __get_temperature_version_suffix is not None:
        return __get_temperature_version_suffix

    # Record the optimistic answer before probing, then downgrade if the symbol is absent.
    __get_temperature_version_suffix = 'V'
    if _nvmlLookupFunctionPointer('nvmlDeviceGetTemperatureV') is None:
        __get_temperature_version_suffix = ''
        LOGGER.debug(
            'NVML get temperature version 1 API is not available due to incompatible '
            'NVIDIA driver. Fallback to use NVML get temperature API without version.',
        )
    else:
        LOGGER.debug('NVML get temperature version 1 API is available.')

    return __get_temperature_version_suffix
929+
930+
def nvmlDeviceGetTemperature(  # pylint: disable=function-redefined
    handle: c_nvmlDevice_t,
    sensor: int,
) -> int:
    """Read the current temperature (in degrees C) of a sensor on the given device.

    The versioned entry point ``nvmlDeviceGetTemperatureV`` is used when the loaded library
    provides it; otherwise the call falls back to the unversioned ``nvmlDeviceGetTemperature``.

    Args:
        handle: The device handle to query.
        sensor: The sensor type to read.

    Returns:
        The temperature reported by the library, as an integer.

    Raises:
        NVMLError_Uninitialized:
            If NVML was not first initialized with :func:`nvmlInit`.
        NVMLError_InvalidArgument:
            If device is invalid, sensorType is invalid or temp is NULL.
        NVMLError_NotSupported:
            If the device does not have the specified sensor.
        NVMLError_GpuIsLost:
            If the target GPU has fallen off the bus or is otherwise inaccessible.
        NVMLError_Unknown:
            On any unexpected error.
        ValueError:
            If the cached version suffix is neither ``'V'`` nor ``''``.
    """
    version_suffix = __determine_get_temperature_version_suffix()

    if version_suffix == 'V':
        # Versioned API: the sensor goes in, and the reading comes back, through one struct.
        query = c_nvmlTemperature_v1_t()
        # pylint: disable-next=attribute-defined-outside-init
        query.version = nvmlTemperature_v1
        # pylint: disable-next=attribute-defined-outside-init
        query.sensorType = _ctypes.c_uint(sensor)
        get_temperature = _nvmlGetFunctionPointer('nvmlDeviceGetTemperatureV')
        status = get_temperature(handle, _ctypes.byref(query))
        if status != NVML_SUCCESS:
            raise NVMLError(status)
        return int(query.temperature)
    elif version_suffix == '':
        # Unversioned API: the sensor is passed by value and the reading via an out-parameter.
        reading = _ctypes.c_uint(0)
        get_temperature = _nvmlGetFunctionPointer('nvmlDeviceGetTemperature')
        status = get_temperature(handle, _ctypes.c_uint(sensor), _ctypes.byref(reading))
        if status != NVML_SUCCESS:
            raise NVMLError(status)
        return reading.value
    else:
        raise ValueError(
            f'Unknown version suffix {version_suffix!r} for function `nvmlDeviceGetTemperature`.',
        )
972+
973+
else:
974+
LOGGER.warning(
975+
'Your installed package `nvidia-ml-py` is corrupted. '
976+
'Skip patch functions `nvmlDeviceGetTemperature`. '
977+
'You may get incorrect or incomplete results. Please consider reinstall package '
978+
'`nvidia-ml-py` via `pip3 install --force-reinstall nvidia-ml-py nvitop`.',
979+
)
980+
891981

892982
# Add support for lookup fallback and context manager ##############################################
893983
class _CustomModule(_ModuleType):

nvitop/version.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868
PYNVML_VERSION_CANDIDATES = (
6969
# Sync with pyproject.toml and requirements.txt
7070
'11.450.51', # the last version supports the R430 driver (CUDA 10.x)
71-
'11.450.129', # requires at last the R450 driver
71+
'11.450.129', # requires at least the R450 driver
7272
'11.460.79',
7373
'11.470.66',
7474
'11.495.46',
@@ -90,6 +90,7 @@
9090
'12.570.86',
9191
'12.570.172',
9292
'12.575.51',
93+
'13.580.65',
9394
)
9495
"""The list of supported ``nvidia-ml-py`` versions.
9596
See also: `nvidia-ml-py's Release History <https://pypi.org/project/nvidia-ml-py/#history>`_.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ classifiers = [
4747
]
4848
dependencies = [
4949
# Sync with nvitop/version.py and requirements.txt
50-
"nvidia-ml-py >= 11.450.51, < 12.576.0a0",
50+
"nvidia-ml-py >= 11.450.51, < 13.581.0a0",
5151
"psutil >= 5.6.6",
5252
"colorama >= 0.4.0; platform_system == 'Windows'",
5353
"windows-curses >= 2.2.0; platform_system == 'Windows'",

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Sync with pyproject.toml and nvitop/version.py
2-
nvidia-ml-py >= 11.450.51, < 12.576.0a0
2+
nvidia-ml-py >= 11.450.51, < 13.581.0a0
33
psutil >= 5.6.6
44
colorama >= 0.4.0; platform_system == 'Windows'
55
windows-curses >= 2.2.0; platform_system == 'Windows'

0 commit comments

Comments
 (0)