Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 16 additions & 21 deletions nsight/thermovision.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from nsight.exceptions import CoolingTimeoutError

"""
This module provides GPU thermal monitoring and throttling prevention using NVIDIA's NVML library.
This module provides GPU thermal monitoring and throttling prevention using NVIDIA's NVML library
(as exposed through cuda.core.system).

It monitors GPU temperature and T.limit (the remaining thermal headroom), and delays
execution while the GPU is too hot so that thermal throttling is avoided. The module uses an adaptive approach that
Expand All @@ -16,20 +17,13 @@

# Guard the NVML binding import: if it is missing, the availability flag is cleared and init() returns False
try:
from pynvml import (
NVML_TEMPERATURE_GPU,
NVMLError_NotSupported,
nvmlDeviceGetHandleByIndex,
nvmlDeviceGetMarginTemperature,
nvmlDeviceGetTemperature,
nvmlInit,
)
from cuda.core import system

PYNVML_AVAILABLE = True
CUDA_CORE_AVAILABLE = True
except ImportError:
PYNVML_AVAILABLE = False
CUDA_CORE_AVAILABLE = False
print(
"Warning: Cannot import pynvml (provided by nvidia-ml-py). Ensure nsight-python was installed properly with all dependencies."
"Warning: Cannot import cuda.core. Ensure nsight-python was installed properly with all dependencies."
)

# Default thermal threshold constants
Expand Down Expand Up @@ -80,7 +74,7 @@ def __init__(
verbose: Whether to print thermal messages.
Default: False
"""
self.handle: Any = None
self.device: Any = None
self.thermal_mode = thermal_mode

# Set adaptive_mode based on thermal_mode
Expand Down Expand Up @@ -110,12 +104,11 @@ def init(self) -> bool:
Returns:
True if temperature retrieval is supported, False otherwise.
"""
if not PYNVML_AVAILABLE:
if not CUDA_CORE_AVAILABLE:
return False

if self.handle is None:
nvmlInit()
self.handle = nvmlDeviceGetHandleByIndex(0)
if self.device is None:
self.device = system.Device(index=0)

return self._is_temp_retrieval_supported()

Expand Down Expand Up @@ -252,8 +245,8 @@ def _get_gpu_tlimit(self) -> int | None:
Thermal headroom in degrees Celsius, or None if not supported
"""
try:
return nvmlDeviceGetMarginTemperature(self.handle) # type: ignore[no-any-return]
except NVMLError_NotSupported as e:
return int(self.device.temperature.margin)
except system.NotSupportedError as e:
print("Error: GPU does not support temperature limit retrieval:", e)
return None
except Exception as e:
Expand All @@ -265,7 +258,9 @@ def _get_gpu_temp(self) -> int:
Returns:
GPU temperature in degrees Celsius
"""
return nvmlDeviceGetTemperature(self.handle, NVML_TEMPERATURE_GPU) # type: ignore[no-any-return]
return int(
self.device.temperature.sensor(system.TemperatureSensors.TEMPERATURE_GPU)
)

def _is_temp_retrieval_supported(self) -> bool:
"""Check if GPU supports temperature retrieval.
Expand All @@ -274,7 +269,7 @@ def _is_temp_retrieval_supported(self) -> bool:
True if supported, False otherwise
"""
try:
nvmlDeviceGetMarginTemperature(self.handle)
self.device.temperature.margin
return True
except Exception:
print(
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ dependencies = [
"pandas",
"numpy",
"matplotlib",
"nvidia-ml-py",
"ncu-report",
"deepdiff",
"cuda-core>=0.7.0",
"cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*",
]

classifiers = [
Expand Down