From b7a6b0004bda2a5a14fbe572ed9b5b7e8258ad89 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 15 Apr 2026 18:40:02 -0400 Subject: [PATCH 1/6] Migrate from pynvml to cuda.core.system --- nsight/thermovision.py | 33 +++++++++++++-------------------- pyproject.toml | 2 +- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/nsight/thermovision.py b/nsight/thermovision.py index 474291c..ca53a8d 100644 --- a/nsight/thermovision.py +++ b/nsight/thermovision.py @@ -7,7 +7,8 @@ from nsight.exceptions import CoolingTimeoutError """ -This module provides GPU thermal monitoring and throttling prevention using NVIDIA's NVML library. +This module provides GPU thermal monitoring and throttling prevention using NVIDIA's NVML library +(as exposed through cuda.core.system). It monitors GPU temperature and T.limit, and delays execution when the GPU is too hot to avoid thermal throttling. The module uses an adaptive approach that @@ -16,20 +17,13 @@ # Guard NVML imports try: - from pynvml import ( - NVML_TEMPERATURE_GPU, - NVMLError_NotSupported, - nvmlDeviceGetHandleByIndex, - nvmlDeviceGetMarginTemperature, - nvmlDeviceGetTemperature, - nvmlInit, - ) + from cuda.core import system - PYNVML_AVAILABLE = True + CUDA_CORE_AVAILABLE = True except ImportError: - PYNVML_AVAILABLE = False + CUDA_CORE_AVAILABLE = False print( - "Warning: Cannot import pynvml (provided by nvidia-ml-py). Ensure nsight-python was installed properly with all dependencies." + "Warning: Cannot import cuda.core. Ensure nsight-python was installed properly with all dependencies." ) # Default thermal threshold constants @@ -110,12 +104,11 @@ def init(self) -> bool: Returns: True if temperature retrieval is supported, False otherwise. 
""" - if not PYNVML_AVAILABLE: + if not CUDA_CORE_AVAILABLE: return False - if self.handle is None: - nvmlInit() - self.handle = nvmlDeviceGetHandleByIndex(0) + if self.device is None: + self.device = system.Device(index=0) return self._is_temp_retrieval_supported() @@ -252,8 +245,8 @@ def _get_gpu_tlimit(self) -> int | None: Thermal headroom in degrees Celsius, or None if not supported """ try: - return nvmlDeviceGetMarginTemperature(self.handle) # type: ignore[no-any-return] - except NVMLError_NotSupported as e: + self.device.temperature.margin + except system.NotSupportedError as e: print("Error: GPU does not support temperature limit retrieval:", e) return None except Exception as e: @@ -265,7 +258,7 @@ def _get_gpu_temp(self) -> int: Returns: GPU temperature in degrees Celsius """ - return nvmlDeviceGetTemperature(self.handle, NVML_TEMPERATURE_GPU) # type: ignore[no-any-return] + return self.device.temperature.sensor(system.TemperatureSensors.TEMPERATURE_GPU) def _is_temp_retrieval_supported(self) -> bool: """Check if GPU supports temperature retrieval. 
@@ -274,7 +267,7 @@ def _is_temp_retrieval_supported(self) -> bool: True if supported, False otherwise """ try: - nvmlDeviceGetMarginTemperature(self.handle) + self.device.temperature.margin return True except Exception: print( diff --git a/pyproject.toml b/pyproject.toml index 6d4758c..9cf969c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,9 +20,9 @@ dependencies = [ "pandas", "numpy", "matplotlib", - "nvidia-ml-py", "ncu-report", "deepdiff", + "cuda-core>=0.7.0" ] classifiers = [ From 9d2fa7de4e143aaeb674188860ba51a537f84a36 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 21 Apr 2026 09:05:07 -0400 Subject: [PATCH 2/6] Address comment in PR --- nsight/thermovision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nsight/thermovision.py b/nsight/thermovision.py index ca53a8d..c345214 100644 --- a/nsight/thermovision.py +++ b/nsight/thermovision.py @@ -74,7 +74,7 @@ def __init__( verbose: Whether to print thermal messages. Default: False """ - self.handle: Any = None + self.device: Any = None self.thermal_mode = thermal_mode # Set adaptive_mode based on thermal_mode From ff32fb1f348b4900eec45ad3d7039df6ccf8a6b4 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 21 Apr 2026 09:06:34 -0400 Subject: [PATCH 3/6] Fix linter issues --- nsight/thermovision.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nsight/thermovision.py b/nsight/thermovision.py index c345214..56c21fd 100644 --- a/nsight/thermovision.py +++ b/nsight/thermovision.py @@ -245,7 +245,7 @@ def _get_gpu_tlimit(self) -> int | None: Thermal headroom in degrees Celsius, or None if not supported """ try: - self.device.temperature.margin + return self.device.temperature.margin except system.NotSupportedError as e: print("Error: GPU does not support temperature limit retrieval:", e) return None @@ -258,7 +258,7 @@ def _get_gpu_temp(self) -> int: Returns: GPU temperature in degrees Celsius """ - return 
self.device.temperature.sensor(system.TemperatureSensors.TEMPERATURE_GPU) + return int(self.device.temperature.sensor(system.TemperatureSensors.TEMPERATURE_GPU)) def _is_temp_retrieval_supported(self) -> bool: """Check if GPU supports temperature retrieval. From 432f07975b4e3c58f3367fc512d383a2615f6199 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 22 Apr 2026 08:41:13 -0400 Subject: [PATCH 4/6] Update dependencies --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9cf969c..488c731 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,8 @@ dependencies = [ "matplotlib", "ncu-report", "deepdiff", - "cuda-core>=0.7.0" + "cuda-core>=0.7.0", + "cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*", ] classifiers = [ From 029fad1009cb0b0aefbf9dfcf5ac04e1ec57aeb2 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 22 Apr 2026 08:42:46 -0400 Subject: [PATCH 5/6] black --- nsight/thermovision.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nsight/thermovision.py b/nsight/thermovision.py index 56c21fd..24bef1d 100644 --- a/nsight/thermovision.py +++ b/nsight/thermovision.py @@ -258,7 +258,9 @@ def _get_gpu_temp(self) -> int: Returns: GPU temperature in degrees Celsius """ - return int(self.device.temperature.sensor(system.TemperatureSensors.TEMPERATURE_GPU)) + return int( + self.device.temperature.sensor(system.TemperatureSensors.TEMPERATURE_GPU) + ) def _is_temp_retrieval_supported(self) -> bool: """Check if GPU supports temperature retrieval. 
From f9086cfe2e4710163405a9b1ea284b7a9c2054a7 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 22 Apr 2026 08:46:13 -0400 Subject: [PATCH 6/6] mypy --- nsight/thermovision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nsight/thermovision.py b/nsight/thermovision.py index 24bef1d..ac1dc5c 100644 --- a/nsight/thermovision.py +++ b/nsight/thermovision.py @@ -245,7 +245,7 @@ def _get_gpu_tlimit(self) -> int | None: Thermal headroom in degrees Celsius, or None if not supported """ try: - return self.device.temperature.margin + return int(self.device.temperature.margin) except system.NotSupportedError as e: print("Error: GPU does not support temperature limit retrieval:", e) return None