Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
fce123f
feat(core.utils): add program caches (in-memory, sqlite, filestream)
cpcloud Apr 23, 2026
4dab6fc
refactor(core): unify Program source retention into _code field
cpcloud Apr 24, 2026
6e5a489
fix(core.utils): treat no_source_include as truthy-only in cache key …
cpcloud Apr 24, 2026
884bde4
refactor(core): extract _raw_compile seam on Program
cpcloud Apr 24, 2026
0fad0d0
feat(core): add Program.compile(cache=...) convenience wrapper
cpcloud Apr 24, 2026
7fd2b41
test(core): cover Program.compile(cache=...) behavior and error paths
cpcloud Apr 24, 2026
a4b8bff
fix(core): use self._code for NVVM pointer so bytearray input compiles
cpcloud Apr 24, 2026
0d40c10
test(core): regression test for Program(bytearray, 'nvvm') input
cpcloud Apr 24, 2026
5330398
test(core): integration tests for Program.compile(cache=...)
cpcloud Apr 24, 2026
1967e6d
docs(core): note Program.compile(cache=...) in api reference
cpcloud Apr 24, 2026
e1a45b6
style(core): silence ruff ARG005 on unused lambda args in cache tests
cpcloud Apr 24, 2026
29e2b13
fix(core): move cache-wrapper compile seam to module level for monkey…
cpcloud Apr 24, 2026
23a69d4
fix(core.utils): drop foreign entries table when schema_meta is absent
cpcloud Apr 24, 2026
540dcbf
fix(core.utils): treat empty schema_meta as unknown-version for entri…
cpcloud Apr 24, 2026
1773340
test(core): use local ProgramOptions for cache-key derivation (Progra…
cpcloud Apr 24, 2026
6f9feaf
style(core.utils): apply ruff format to _program_cache.py
cpcloud Apr 24, 2026
cae0824
fix(core.utils): reject bytearray in NVRTC name_expressions cache keys
cpcloud Apr 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cuda_core/cuda/core/_program.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@ cdef class Program:
object _compile_lock # Per-instance lock for compile-time mutation
bint _use_libdevice # Flag for libdevice loading
bint _libdevice_added
bytes _nvrtc_code # Source code for NVRTC retry (PCH auto-resize)
bytes _code # Source code as bytes: used for key derivation and NVRTC PCH retry
str _code_type # Normalised code_type ("c++", "ptx", "nvvm")
str _pch_status # PCH creation outcome after compile
80 changes: 73 additions & 7 deletions cuda_core/cuda/core/_program.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,12 @@ cdef class Program:
self._h_nvvm.reset()

def compile(
self, target_type: str, name_expressions: tuple | list = (), logs = None
self,
target_type: str,
name_expressions: tuple | list = (),
logs=None,
*,
cache: "ProgramCacheResource | None" = None,
) -> ObjectCode:
"""Compile the program to the specified target type.

Expand All @@ -98,13 +103,55 @@ cdef class Program:
Used for template instantiation and similar cases.
logs : object, optional
Object with a ``write`` method to receive compilation logs.
cache : :class:`~cuda.core.utils.ProgramCacheResource`, optional
If provided, the compiled :class:`~cuda.core.ObjectCode` is looked
up in ``cache`` via a key derived from the program's code, options,
``target_type`` and ``name_expressions``. On a hit the cached
``ObjectCode`` is returned without re-compiling; on a miss the
fresh compile result is stored. Options that require an
``extra_digest`` (``include_path``, ``pre_include``, ``pch``,
``use_pch``, ``pch_dir``, NVVM ``use_libdevice=True``, or NVRTC
``options.name`` with a directory component) raise ``ValueError``
via :func:`~cuda.core.utils.make_program_cache_key`; for those
compiles, use the manual ``make_program_cache_key(...)`` pattern
directly.

Returns
-------
:class:`~cuda.core.ObjectCode`
The compiled object code.
"""
return Program_compile(self, target_type, name_expressions, logs)
if cache is None:
return _program_compile_uncached(self, target_type, name_expressions, logs)

# Deferred import to avoid a circular import between _program and
# cuda.core.utils._program_cache (the cache module already imports
# ProgramOptions from this module). Import from the leaf module so
# tests that monkeypatch make_program_cache_key via that path
# intercept reliably.
from cuda.core.utils._program_cache import make_program_cache_key

# ``self._code`` is always stored as bytes (see ``Program_init``),
# but ``make_program_cache_key`` only accepts bytes when
# ``code_type == "nvvm"`` -- c++/ptx must be ``str``. Decode back
# to the original str for the NVRTC/linker paths so the generated
# key matches keys callers build by passing the str source
# directly.
code_for_key = self._code if self._code_type == "nvvm" else self._code.decode("utf-8")

key = make_program_cache_key(
code=code_for_key,
code_type=self._code_type,
options=self._options,
target_type=target_type,
name_expressions=name_expressions,
)
hit = cache.get(key)
if hit is not None:
return hit
compiled = _program_compile_uncached(self, target_type, name_expressions, logs)
cache[key] = compiled
return compiled

@property
def pch_status(self) -> str | None:
Expand Down Expand Up @@ -503,6 +550,19 @@ class ProgramOptions:
# Private Classes and Helper Functions
# =============================================================================


def _program_compile_uncached(program, target_type, name_expressions, logs):
    """Compile ``program`` directly, bypassing the cache wrapper.

    This is a thin forwarding shim: every argument is handed to
    ``Program_compile`` unchanged and its result is returned as-is.

    It is deliberately a plain module-level Python function. ``Program`` is a
    ``cdef class`` whose methods cannot be reassigned from Python, so tests
    that want to exercise the cache wrapper in :meth:`Program.compile`
    without invoking NVRTC monkeypatch this name on ``cuda.core._program``
    instead.
    """
    compiled = Program_compile(program, target_type, name_expressions, logs)
    return compiled


# Module-level state for NVVM lazy loading
_nvvm_module = None
_nvvm_import_attempted = False
Expand Down Expand Up @@ -618,6 +678,7 @@ cdef inline int Program_init(Program self, object code, str code_type, object op

self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
code_type = code_type.lower()
self._code_type = code_type
self._compile_lock = threading.Lock()
self._use_libdevice = False
self._libdevice_added = False
Expand All @@ -638,16 +699,18 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
&nvrtc_prog, code_ptr, name_ptr, 0, NULL, NULL))
self._h_nvrtc = create_nvrtc_program_handle(nvrtc_prog)
self._nvrtc_code = code_bytes
self._code = code_bytes
self._backend = "NVRTC"
self._linker = None

elif code_type == "ptx":
assert_type(code, str)
if options.extra_sources is not None:
raise ValueError("extra_sources is not supported by the PTX backend.")
code_bytes = code.encode()
self._code = code_bytes
self._linker = Linker(
ObjectCode._init(code.encode(), code_type), options=_translate_program_options(options)
ObjectCode._init(code_bytes, code_type), options=_translate_program_options(options)
)
self._backend = self._linker.backend

Expand All @@ -657,10 +720,13 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
code = code.encode("utf-8")
elif not isinstance(code, (bytes, bytearray)):
raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")
self._code = bytes(code) # Coerce bytearray -> bytes so retention type is stable

code_ptr = <const char*>(<bytes>code)
# Use self._code (strictly bytes) for the C pointer so a bytearray
# input doesn't trip the `<bytes>code` cast at runtime.
code_ptr = <const char*>self._code
name_ptr = <const char*>options._name
code_len = len(code)
code_len = len(self._code)

with nogil:
HANDLE_RETURN_NVVM(NULL, cynvvm.nvvmCreateProgram(&nvvm_prog))
Expand Down Expand Up @@ -832,7 +898,7 @@ cdef object Program_compile_nvrtc(Program self, str target_type, object name_exp
HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcSetPCHHeapSize(required))

cdef cynvrtc.nvrtcProgram retry_prog
cdef const char* code_ptr = <const char*>self._nvrtc_code
cdef const char* code_ptr = <const char*>self._code
cdef const char* name_ptr = <const char*>self._options._name
with nogil:
HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
Expand Down
8 changes: 0 additions & 8 deletions cuda_core/cuda/core/utils.py

This file was deleted.

45 changes: 45 additions & 0 deletions cuda_core/cuda/core/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from cuda.core._memoryview import (
StridedMemoryView,
args_viewable_as_strided_memory,
)

# Lazily expose the program-cache APIs so ``from cuda.core.utils import
# StridedMemoryView`` stays lightweight -- the cache backends pull in driver,
# NVRTC, and module-load machinery that memoryview-only consumers do not need.
# The laziness guarantee is for explicit imports only: ``from cuda.core.utils
# import *`` walks ``__all__`` and therefore resolves every lazy attribute,
# which eagerly pulls ``_program_cache`` in. Star-imports are discouraged
# anyway, so treat that as expected.
# Names served on demand by the module-level ``__getattr__`` below: each one
# is looked up on ``cuda.core.utils._program_cache`` the first time it is
# accessed and then stored in this module's globals.
_LAZY_CACHE_ATTRS = (
"FileStreamProgramCache",
"InMemoryProgramCache",
"ProgramCacheResource",
"SQLiteProgramCache",
"make_program_cache_key",
)

# Public API: the eagerly imported memoryview helpers followed by every lazy
# cache name. Unpacking ``_LAZY_CACHE_ATTRS`` here keeps the two in sync.
__all__ = [
"StridedMemoryView",
"args_viewable_as_strided_memory",
*_LAZY_CACHE_ATTRS,
]


def __getattr__(name):
    """Resolve a lazily exposed program-cache attribute on first access.

    Called by Python only when normal module attribute lookup fails. Names
    listed in ``_LAZY_CACHE_ATTRS`` are fetched from
    ``cuda.core.utils._program_cache`` and stored in this module's globals,
    so later accesses find them directly and never reach this hook again.

    Raises
    ------
    AttributeError
        If ``name`` is not one of the lazily exposed attributes.
    """
    # Guard clause: anything outside the lazy set is a genuine miss.
    if name not in _LAZY_CACHE_ATTRS:
        raise AttributeError(f"module 'cuda.core.utils' has no attribute {name!r}")

    # Deferred import: the cache module is only loaded once it is needed.
    from cuda.core.utils import _program_cache

    resolved = getattr(_program_cache, name)
    globals()[name] = resolved  # memoise so the next lookup bypasses __getattr__
    return resolved


def __dir__():
    """Return the sorted attribute names reported by ``dir()`` for this module.

    The result is the union of the module's real namespace (so REPLs and
    introspection tools still see ``__file__``, ``__spec__``, etc.) and the
    public names in ``__all__``, which includes lazy attributes that have not
    been resolved yet.
    """
    visible = set(globals())
    visible.update(__all__)
    return sorted(visible)
Loading
Loading