NVIDIA · cpcloud · Apr 23, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026
diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd
@@ -17,5 +17,6 @@ cdef class Program:
         object _compile_lock  # Per-instance lock for compile-time mutation
         bint _use_libdevice      # Flag for libdevice loading
         bint _libdevice_added
-        bytes _nvrtc_code       # Source code for NVRTC retry (PCH auto-resize)
+        bytes _code             # Source code as bytes: used for key derivation and NVRTC PCH retry
+        str _code_type          # Normalised code_type ("c++", "ptx", "nvvm")
         str _pch_status         # PCH creation outcome after compile
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
@@ -85,7 +85,12 @@ cdef class Program:
         self._h_nvvm.reset()
 
     def compile(
-        self, target_type: str, name_expressions: tuple | list = (), logs = None
+        self,
+        target_type: str,
+        name_expressions: tuple | list = (),
+        logs=None,
+        *,
+        cache: "ProgramCacheResource | None" = None,
     ) -> ObjectCode:
         """Compile the program to the specified target type.
 
@@ -98,13 +103,55 @@ cdef class Program:
             Used for template instantiation and similar cases.
         logs : object, optional
             Object with a ``write`` method to receive compilation logs.
+        cache : :class:`~cuda.core.utils.ProgramCacheResource`, optional
+            If provided, the compiled :class:`~cuda.core.ObjectCode` is looked
+            up in ``cache`` via a key derived from the program's code, options,
+            ``target_type`` and ``name_expressions``. On a hit the cached
+            ``ObjectCode`` is returned without re-compiling; on a miss the
+            fresh compile result is stored. Options that require an
+            ``extra_digest`` (``include_path``, ``pre_include``, ``pch``,
+            ``use_pch``, ``pch_dir``, NVVM ``use_libdevice=True``, or NVRTC
+            ``options.name`` with a directory component) raise ``ValueError``
+            via :func:`~cuda.core.utils.make_program_cache_key`; for those
+            compiles, use the manual ``make_program_cache_key(...)`` pattern
+            directly.
 
         Returns
         -------
         :class:`~cuda.core.ObjectCode`
             The compiled object code.
         """
-        return Program_compile(self, target_type, name_expressions, logs)
+        if cache is None:
+            return _program_compile_uncached(self, target_type, name_expressions, logs)
+
+        # Deferred import to avoid a circular import between _program and
+        # cuda.core.utils._program_cache (the cache module already imports
+        # ProgramOptions from this module). Import from the leaf module so
+        # tests that monkeypatch make_program_cache_key via that path
+        # intercept reliably.
+        from cuda.core.utils._program_cache import make_program_cache_key
+
+        # ``self._code`` is always stored as bytes (see ``Program_init``),
+        # but ``make_program_cache_key`` only accepts bytes when
+        # ``code_type == "nvvm"`` -- c++/ptx must be ``str``. Decode back
+        # to the original str for the NVRTC/linker paths so the generated
+        # key matches keys callers build by passing the str source
+        # directly.
+        code_for_key = self._code if self._code_type == "nvvm" else self._code.decode("utf-8")
+
+        key = make_program_cache_key(
+            code=code_for_key,
+            code_type=self._code_type,
+            options=self._options,
+            target_type=target_type,
+            name_expressions=name_expressions,
+        )
+        hit = cache.get(key)
+        if hit is not None:
+            return hit
+        compiled = _program_compile_uncached(self, target_type, name_expressions, logs)
+        cache[key] = compiled
+        return compiled
 
     @property
     def pch_status(self) -> str | None:
@@ -503,6 +550,19 @@ class ProgramOptions:
 # Private Classes and Helper Functions
 # =============================================================================
 
+
+def _program_compile_uncached(program, target_type, name_expressions, logs):
+    """Compile ``program`` by delegating straight to ``Program_compile``.
+
+    This is the cache-free path used by :meth:`Program.compile`. It is kept as
+    a module-level ``def`` because ``Program`` is a ``cdef class`` whose
+    methods cannot be reassigned from Python; tests patch this function on
+    ``cuda.core._program`` to exercise the cache wrapper without NVRTC.
+    """
+    object_code = Program_compile(program, target_type, name_expressions, logs)
+    return object_code
+
+
 # Module-level state for NVVM lazy loading
 _nvvm_module = None
 _nvvm_import_attempted = False
@@ -618,6 +678,7 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
 
     self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
     code_type = code_type.lower()
+    self._code_type = code_type
     self._compile_lock = threading.Lock()
     self._use_libdevice = False
     self._libdevice_added = False
@@ -638,16 +699,18 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
             HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
                 &nvrtc_prog, code_ptr, name_ptr, 0, NULL, NULL))
         self._h_nvrtc = create_nvrtc_program_handle(nvrtc_prog)
-        self._nvrtc_code = code_bytes
+        self._code = code_bytes
         self._backend = "NVRTC"
         self._linker = None
 
     elif code_type == "ptx":
         assert_type(code, str)
         if options.extra_sources is not None:
             raise ValueError("extra_sources is not supported by the PTX backend.")
+        code_bytes = code.encode()
+        self._code = code_bytes
         self._linker = Linker(
-            ObjectCode._init(code.encode(), code_type), options=_translate_program_options(options)
+            ObjectCode._init(code_bytes, code_type), options=_translate_program_options(options)
         )
         self._backend = self._linker.backend
 
@@ -657,10 +720,13 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
             code = code.encode("utf-8")
         elif not isinstance(code, (bytes, bytearray)):
             raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")
+        self._code = bytes(code)  # Coerce bytearray -> bytes so retention type is stable
 
-        code_ptr = <const char*>(<bytes>code)
+        # Use self._code (strictly bytes) for the C pointer so a bytearray
+        # input doesn't trip the `<bytes>code` cast at runtime.
+        code_ptr = <const char*>self._code
         name_ptr = <const char*>options._name
-        code_len = len(code)
+        code_len = len(self._code)
 
         with nogil:
             HANDLE_RETURN_NVVM(NULL, cynvvm.nvvmCreateProgram(&nvvm_prog))
@@ -832,7 +898,7 @@ cdef object Program_compile_nvrtc(Program self, str target_type, object name_exp
         HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcSetPCHHeapSize(required))
 
     cdef cynvrtc.nvrtcProgram retry_prog
-    cdef const char* code_ptr = <const char*>self._nvrtc_code
+    cdef const char* code_ptr = <const char*>self._code
     cdef const char* name_ptr = <const char*>self._options._name
     with nogil:
         HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(

diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py
diff --git a/cuda_core/cuda/core/utils/__init__.py b/cuda_core/cuda/core/utils/__init__.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.core._memoryview import (
+    StridedMemoryView,
+    args_viewable_as_strided_memory,
+)
+
+# Lazily expose the program-cache APIs so ``from cuda.core.utils import
+# StridedMemoryView`` stays lightweight -- the cache backends pull in driver,
+# NVRTC, and module-load machinery that memoryview-only consumers do not need.
+# The laziness guarantee is for explicit imports only: ``from cuda.core.utils
+# import *`` walks ``__all__`` and therefore resolves every lazy attribute,
+# which eagerly pulls ``_program_cache`` in. Star-imports are discouraged
+# anyway, so treat that as expected.
+_LAZY_CACHE_ATTRS = (  # names served on first access by the module-level __getattr__
+    "FileStreamProgramCache",
+    "InMemoryProgramCache",
+    "ProgramCacheResource",
+    "SQLiteProgramCache",
+    "make_program_cache_key",
+)
+
+__all__ = [
+    "StridedMemoryView",
+    "args_viewable_as_strided_memory",
+    *_LAZY_CACHE_ATTRS,  # lazy names are part of the public API too
+]
+
+
+def __getattr__(name):
+    if name not in _LAZY_CACHE_ATTRS:
+        raise AttributeError(f"module 'cuda.core.utils' has no attribute {name!r}")
+    # Deferred import: only a request for a lazy name loads ``_program_cache``.
+    from cuda.core.utils import _program_cache
+
+    resolved = globals()[name] = getattr(_program_cache, name)  # memoize in module globals
+    return resolved
+
+
+def __dir__():
+    # Report the real module globals (``__file__``, ``__spec__``, ...) together
+    # with every name in ``__all__``, so introspection sees lazy names as well.
+    return sorted({*globals(), *__all__})