diff --git a/nsight/thermovision.py b/nsight/thermovision.py index 474291c..ac1dc5c 100644 --- a/nsight/thermovision.py +++ b/nsight/thermovision.py @@ -7,7 +7,8 @@ from nsight.exceptions import CoolingTimeoutError """ -This module provides GPU thermal monitoring and throttling prevention using NVIDIA's NVML library. +This module provides GPU thermal monitoring and throttling prevention using NVIDIA's NVML library +(as exposed through cuda.core.system). It monitors GPU temperature and T.limit, and delays execution when the GPU is too hot to avoid thermal throttling. The module uses an adaptive approach that @@ -16,20 +17,13 @@ # Guard NVML imports try: - from pynvml import ( - NVML_TEMPERATURE_GPU, - NVMLError_NotSupported, - nvmlDeviceGetHandleByIndex, - nvmlDeviceGetMarginTemperature, - nvmlDeviceGetTemperature, - nvmlInit, - ) + from cuda.core import system - PYNVML_AVAILABLE = True + CUDA_CORE_AVAILABLE = True except ImportError: - PYNVML_AVAILABLE = False + CUDA_CORE_AVAILABLE = False print( - "Warning: Cannot import pynvml (provided by nvidia-ml-py). Ensure nsight-python was installed properly with all dependencies." + "Warning: Cannot import cuda.core. Ensure nsight-python was installed properly with all dependencies." ) # Default thermal threshold constants @@ -80,7 +74,7 @@ def __init__( verbose: Whether to print thermal messages. Default: False """ - self.handle: Any = None + self.device: Any = None self.thermal_mode = thermal_mode # Set adaptive_mode based on thermal_mode @@ -110,12 +104,11 @@ def init(self) -> bool: Returns: True if temperature retrieval is supported, False otherwise. """ - if not PYNVML_AVAILABLE: + if not CUDA_CORE_AVAILABLE: return False - if self.handle is None: - nvmlInit() - self.handle = nvmlDeviceGetHandleByIndex(0) + if self.device is None: + self.device = system.Device(index=0) return self._is_temp_retrieval_supported() @@ -252,8 +245,8 @@ def _get_gpu_tlimit(self) -> int | None: Thermal headroom in degrees Celsius, or None if not supported """ try: - return nvmlDeviceGetMarginTemperature(self.handle) # type: ignore[no-any-return] - except NVMLError_NotSupported as e: + return int(self.device.temperature.margin) + except system.NotSupportedError as e: print("Error: GPU does not support temperature limit retrieval:", e) return None except Exception as e: @@ -265,7 +258,9 @@ def _get_gpu_temp(self) -> int: Returns: GPU temperature in degrees Celsius """ - return nvmlDeviceGetTemperature(self.handle, NVML_TEMPERATURE_GPU) # type: ignore[no-any-return] + return int( + self.device.temperature.sensor(system.TemperatureSensors.TEMPERATURE_GPU) + ) def _is_temp_retrieval_supported(self) -> bool: """Check if GPU supports temperature retrieval. @@ -274,7 +269,7 @@ def _is_temp_retrieval_supported(self) -> bool: True if supported, False otherwise """ try: - nvmlDeviceGetMarginTemperature(self.handle) + self.device.temperature.margin return True except Exception: print( diff --git a/pyproject.toml b/pyproject.toml index 6d4758c..488c731 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,9 +20,10 @@ dependencies = [ "pandas", "numpy", "matplotlib", - "nvidia-ml-py", "ncu-report", "deepdiff", + "cuda-core>=0.7.0", + "cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*", ] classifiers = [