From 353a38295e10aa99bb2196ce16b83d0075ec5c86 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 24 Apr 2026 23:16:58 +0000 Subject: [PATCH 01/11] Implement green context v1 API --- cuda_core/cuda/core/__init__.py | 8 + cuda_core/cuda/core/_context.pxd | 9 +- cuda_core/cuda/core/_context.pyx | 30 +- cuda_core/cuda/core/_cpp/resource_handles.cpp | 95 +++++ cuda_core/cuda/core/_cpp/resource_handles.hpp | 45 ++ cuda_core/cuda/core/_device.pyx | 101 ++++- cuda_core/cuda/core/_device_resources.pxd | 40 ++ cuda_core/cuda/core/_device_resources.pyx | 400 ++++++++++++++++++ cuda_core/cuda/core/_resource_handles.pxd | 14 + cuda_core/cuda/core/_resource_handles.pyx | 26 ++ cuda_core/tests/test_green_context.py | 123 ++++++ 11 files changed, 886 insertions(+), 5 deletions(-) create mode 100644 cuda_core/cuda/core/_device_resources.pxd create mode 100644 cuda_core/cuda/core/_device_resources.pyx create mode 100644 cuda_core/tests/test_green_context.py diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index dfd52accea3..fe1ba76806a 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -29,7 +29,15 @@ def _import_versioned_module(): from cuda.core import system, utils +from cuda.core._context import Context, ContextOptions from cuda.core._device import Device +from cuda.core._device_resources import ( + DeviceResources, + SMResource, + SMResourceOptions, + WorkqueueResource, + WorkqueueResourceOptions, +) from cuda.core._event import Event, EventOptions from cuda.core._graphics import GraphicsResource from cuda.core._launch_config import LaunchConfig diff --git a/cuda_core/cuda/core/_context.pxd b/cuda_core/cuda/core/_context.pxd index 9e1a460f50f..5d3ed9bba2f 100644 --- a/cuda_core/cuda/core/_context.pxd +++ b/cuda_core/cuda/core/_context.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core._resource_handles cimport ContextHandle +from cuda.core._resource_handles cimport ContextHandle, GreenCtxHandle cdef class Context: """Cython declaration for Context class. @@ -13,8 +13,15 @@ cdef class Context: cdef: ContextHandle _h_context + GreenCtxHandle _h_green_ctx int _device_id + bint _is_green object __weakref__ @staticmethod cdef Context _from_handle(type cls, ContextHandle h_context, int device_id) + + @staticmethod + cdef Context _from_green_ctx(type cls, GreenCtxHandle h_green_ctx, int device_id) + + cpdef close(self) diff --git a/cuda_core/cuda/core/_context.pyx b/cuda_core/cuda/core/_context.pyx index b2b21465c81..e9cffc10868 100644 --- a/cuda_core/cuda/core/_context.pyx +++ b/cuda_core/cuda/core/_context.pyx @@ -6,6 +6,8 @@ from dataclasses import dataclass from cuda.core._resource_handles cimport ( ContextHandle, + GreenCtxHandle, + get_green_ctx_context, as_intptr, as_py, ) @@ -30,6 +32,17 @@ cdef class Context: cdef Context ctx = cls.__new__(cls) ctx._h_context = h_context ctx._device_id = device_id + ctx._is_green = False + return ctx + + @staticmethod + cdef Context _from_green_ctx(type cls, GreenCtxHandle h_green_ctx, int device_id): + """Create Context from an owning green context handle.""" + cdef Context ctx = cls.__new__(cls) + ctx._h_green_ctx = h_green_ctx + ctx._h_context = get_green_ctx_context(h_green_ctx) + ctx._device_id = device_id + ctx._is_green = True return ctx @property @@ -43,6 +56,16 @@ cdef class Context: def _handle(self): return self.handle + @property + def is_green(self) -> bool: + """True if this context was created from device resources.""" + return bool(self._is_green) + + cpdef close(self): + """Release this context wrapper's underlying CUDA handles.""" + self._h_context.reset() + self._h_green_ctx.reset() + def __eq__(self, other): if not isinstance(other, Context): return NotImplemented @@ -60,6 +83,9 @@ cdef class Context: class ContextOptions: """Options for context creation. - Currently unused, reserved for future use. + Attributes + ---------- + resources : Sequence[SMResource | WorkqueueResource], optional + Device resources used to create a green context. """ - pass # TODO + resources: object = None diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index a21cd8a8aa5..749ad07847e 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -29,6 +29,10 @@ namespace cuda_core { decltype(&cuDevicePrimaryCtxRetain) p_cuDevicePrimaryCtxRetain = nullptr; decltype(&cuDevicePrimaryCtxRelease) p_cuDevicePrimaryCtxRelease = nullptr; decltype(&cuCtxGetCurrent) p_cuCtxGetCurrent = nullptr; +decltype(&cuGreenCtxCreate) p_cuGreenCtxCreate = nullptr; +decltype(&cuGreenCtxDestroy) p_cuGreenCtxDestroy = nullptr; +decltype(&cuCtxFromGreenCtx) p_cuCtxFromGreenCtx = nullptr; +decltype(&cuDevResourceGenerateDesc) p_cuDevResourceGenerateDesc = nullptr; decltype(&cuStreamCreateWithPriority) p_cuStreamCreateWithPriority = nullptr; decltype(&cuStreamDestroy) p_cuStreamDestroy = nullptr; @@ -224,6 +228,15 @@ namespace { struct ContextBox { CUcontext resource; }; + +struct GreenCtxBox { + CUgreenCtx resource; + CUcontext context; +}; + +struct DevResourceDescBox { + CUdevResourceDesc resource; +}; } // namespace ContextHandle create_context_handle_ref(CUcontext ctx) { @@ -231,6 +244,88 @@ ContextHandle create_context_handle_ref(CUcontext ctx) { return ContextHandle(box, &box->resource); } +static const GreenCtxBox* get_box(const GreenCtxHandle& h) noexcept { + const CUgreenCtx* p = h.get(); + return reinterpret_cast( + reinterpret_cast(p) - offsetof(GreenCtxBox, resource) + ); +} + +GreenCtxHandle create_green_ctx_handle(CUdevResourceDesc desc, CUdevice dev, unsigned int flags) { + if (!p_cuGreenCtxCreate || !p_cuCtxFromGreenCtx) { + err = CUDA_ERROR_NOT_SUPPORTED; + return {}; + } + + GILReleaseGuard gil; + CUgreenCtx green_ctx = nullptr; + CUcontext ctx = nullptr; + if (CUDA_SUCCESS != (err = p_cuGreenCtxCreate(&green_ctx, desc, dev, flags))) { + return {}; + } + if (CUDA_SUCCESS != (err = p_cuCtxFromGreenCtx(&ctx, green_ctx))) { + if (p_cuGreenCtxDestroy) { + p_cuGreenCtxDestroy(green_ctx); + } + return {}; + } + + auto box = std::shared_ptr( + new GreenCtxBox{green_ctx, ctx}, + [](const GreenCtxBox* b) { + GILReleaseGuard gil; + if (p_cuGreenCtxDestroy) { + p_cuGreenCtxDestroy(b->resource); + } + delete b; + } + ); + return GreenCtxHandle(box, &box->resource); +} + +GreenCtxHandle create_green_ctx_handle_ref(CUgreenCtx green_ctx) { + if (!green_ctx) { + return {}; + } + if (!p_cuCtxFromGreenCtx) { + err = CUDA_ERROR_NOT_SUPPORTED; + return {}; + } + + GILReleaseGuard gil; + CUcontext ctx = nullptr; + if (CUDA_SUCCESS != (err = p_cuCtxFromGreenCtx(&ctx, green_ctx))) { + return {}; + } + + auto box = std::make_shared(GreenCtxBox{green_ctx, ctx}); + return GreenCtxHandle(box, &box->resource); +} + +ContextHandle get_green_ctx_context(const GreenCtxHandle& h) noexcept { + if (!h) { + return {}; + } + const GreenCtxBox* box = get_box(h); + return ContextHandle(h, &box->context); +} + +DevResourceDescHandle create_dev_resource_desc_handle(CUdevResource* resources, unsigned int nbResources) { + if (!p_cuDevResourceGenerateDesc) { + err = CUDA_ERROR_NOT_SUPPORTED; + return {}; + } + + GILReleaseGuard gil; + CUdevResourceDesc desc = nullptr; + if (CUDA_SUCCESS != (err = p_cuDevResourceGenerateDesc(&desc, resources, nbResources))) { + return {}; + } + + auto box = std::make_shared(DevResourceDescBox{desc}); + return DevResourceDescHandle(box, &box->resource); +} + // Thread-local cache of primary contexts indexed by device ID static thread_local std::vector primary_context_cache; diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index d63fb869973..c312d05f570 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -59,6 +59,10 @@ void clear_last_error() noexcept; extern decltype(&cuDevicePrimaryCtxRetain) p_cuDevicePrimaryCtxRetain; extern decltype(&cuDevicePrimaryCtxRelease) p_cuDevicePrimaryCtxRelease; extern decltype(&cuCtxGetCurrent) p_cuCtxGetCurrent; +extern decltype(&cuGreenCtxCreate) p_cuGreenCtxCreate; +extern decltype(&cuGreenCtxDestroy) p_cuGreenCtxDestroy; +extern decltype(&cuCtxFromGreenCtx) p_cuCtxFromGreenCtx; +extern decltype(&cuDevResourceGenerateDesc) p_cuDevResourceGenerateDesc; extern decltype(&cuStreamCreateWithPriority) p_cuStreamCreateWithPriority; extern decltype(&cuStreamDestroy) p_cuStreamDestroy; @@ -142,6 +146,8 @@ extern NvJitLinkDestroyFn p_nvJitLinkDestroy; // ============================================================================ using ContextHandle = std::shared_ptr; +using GreenCtxHandle = std::shared_ptr; +using DevResourceDescHandle = std::shared_ptr; using StreamHandle = std::shared_ptr; using EventHandle = std::shared_ptr; using MemoryPoolHandle = std::shared_ptr; @@ -164,6 +170,21 @@ using FileDescriptorHandle = std::shared_ptr; // Function to create a non-owning context handle (references existing context). ContextHandle create_context_handle_ref(CUcontext ctx); +// Create an owning green context handle. The handle keeps the paired CUcontext +// returned by cuCtxFromGreenCtx in the same control block. +GreenCtxHandle create_green_ctx_handle(CUdevResourceDesc desc, CUdevice dev, unsigned int flags); + +// Create a non-owning green context handle. +GreenCtxHandle create_green_ctx_handle_ref(CUgreenCtx ctx); + +// Get the CUcontext paired with a green context handle. The returned handle +// shares ownership with the green context. +ContextHandle get_green_ctx_context(const GreenCtxHandle& h) noexcept; + +// Generate a descriptor for a resource list. CUDA exposes no explicit destroy +// API for CUdevResourceDesc; this handle only carries the opaque value. +DevResourceDescHandle create_dev_resource_desc_handle(CUdevResource* resources, unsigned int nbResources); + // Get handle to the primary context for a device (with thread-local caching) // Returns empty handle on error (caller must check) ContextHandle get_primary_context(int device_id); @@ -501,6 +522,14 @@ inline CUcontext as_cu(const ContextHandle& h) noexcept { return h ? *h : nullptr; } +inline CUgreenCtx as_cu(const GreenCtxHandle& h) noexcept { + return h ? *h : nullptr; +} + +inline CUdevResourceDesc as_cu(const DevResourceDescHandle& h) noexcept { + return h ? *h : nullptr; +} + inline CUstream as_cu(const StreamHandle& h) noexcept { return h ? *h : nullptr; } @@ -559,6 +588,14 @@ inline std::intptr_t as_intptr(const ContextHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } +inline std::intptr_t as_intptr(const GreenCtxHandle& h) noexcept { + return reinterpret_cast(as_cu(h)); +} + +inline std::intptr_t as_intptr(const DevResourceDescHandle& h) noexcept { + return reinterpret_cast(as_cu(h)); +} + inline std::intptr_t as_intptr(const StreamHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } @@ -649,6 +686,14 @@ inline PyObject* as_py(const ContextHandle& h) noexcept { return detail::make_py("cuda.bindings.driver", "CUcontext", as_intptr(h)); } +inline PyObject* as_py(const GreenCtxHandle& h) noexcept { + return detail::make_py("cuda.bindings.driver", "CUgreenCtx", as_intptr(h)); +} + +inline PyObject* as_py(const DevResourceDescHandle& h) noexcept { + return detail::make_py("cuda.bindings.driver", "CUdevResourceDesc", as_intptr(h)); +} + inline PyObject* as_py(const StreamHandle& h) noexcept { return detail::make_py("cuda.bindings.driver", "CUstream", as_intptr(h)); } diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index 1ea2df564c4..0abe80b7e01 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -8,18 +8,32 @@ cimport cpython from cuda.bindings cimport cydriver from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from libc.stdlib cimport free, malloc import threading from cuda.core._context cimport Context from cuda.core._context import ContextOptions +from cuda.core._device_resources cimport DeviceResources, SMResource, WorkqueueResource +from cuda.core._device_resources import ( + DeviceResources, + SMResource, + SMResourceOptions, + WorkqueueResource, + WorkqueueResourceOptions, +) from cuda.core._event cimport Event as cyEvent from cuda.core._event import Event, EventOptions from cuda.core._memory._buffer cimport Buffer, MemoryResource from cuda.core._resource_handles cimport ( ContextHandle, + DevResourceDescHandle, + GreenCtxHandle, create_context_handle_ref, + create_dev_resource_desc_handle, + create_green_ctx_handle, get_primary_context, + get_last_error, as_cu, ) @@ -954,7 +968,16 @@ class Device: Default value of `None` return the currently used device. """ - __slots__ = ("_device_id", "_memory_resource", "_has_inited", "_properties", "_uuid", "_context", "__weakref__") + __slots__ = ( + "_device_id", + "_memory_resource", + "_has_inited", + "_properties", + "_resources", + "_uuid", + "_context", + "__weakref__", + ) def __new__(cls, device_id: Device | int | None = None): if isinstance(device_id, Device): @@ -1100,6 +1123,13 @@ class Device: return self._properties + @property + def resources(self) -> DeviceResources: + """Return the hardware resource query namespace for this device.""" + if self._resources is None: + self._resources = DeviceResources._init(self._device_id) + return self._resources + @property def compute_capability(self) -> ComputeCapability: """Return a named tuple with 2 fields: major and minor.""" @@ -1219,6 +1249,7 @@ class Device: """ cdef ContextHandle h_context cdef cydriver.CUcontext prev_ctx, curr_ctx + cdef Context prev_owned = None if ctx is not None: # TODO: revisit once Context is cythonized @@ -1228,6 +1259,8 @@ class Device: "the provided context was created on the device with" f" id={ctx._device_id}, which is different from the target id={self._device_id}" ) + if self._has_inited and self._context is not None: + prev_owned = self._context # prev_ctx is the previous context curr_ctx = as_cu(ctx._h_context) prev_ctx = NULL @@ -1237,6 +1270,8 @@ class Device: self._has_inited = True self._context = ctx # Store owning context reference if prev_ctx != NULL: + if prev_owned is not None and as_cu(prev_owned._h_context) == prev_ctx: + return prev_owned return Context._from_handle(Context, create_context_handle_ref(prev_ctx), self._device_id) else: # use primary ctx @@ -1266,7 +1301,68 @@ class Device: Newly created context object. """ - raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") + cdef int n_resources + cdef int i + cdef object resources + cdef object res + cdef SMResource sm_res + cdef WorkqueueResource wq_res + cdef cydriver.CUdevResource* c_resources = NULL + cdef DevResourceDescHandle h_desc + cdef GreenCtxHandle h_green + + if options is None: + raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") + + assert_type(options, ContextOptions) + if options.resources is None: + raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") + + resources = options.resources + if isinstance(resources, (SMResource, WorkqueueResource)): + resources = (resources,) + else: + resources = tuple(resources) + if len(resources) == 0: + raise ValueError("ContextOptions.resources must not be empty") + + n_resources = len(resources) + c_resources = malloc( + n_resources * sizeof(cydriver.CUdevResource) + ) + if c_resources == NULL: + raise MemoryError() + + try: + for i, res in enumerate(resources): + if isinstance(res, SMResource): + sm_res = res + if not sm_res._is_usable: + raise ValueError("dry-run SMResource objects cannot be used to create a context") + c_resources[i] = sm_res._resource + elif isinstance(res, WorkqueueResource): + wq_res = res + c_resources[i] = wq_res._wq_config_resource + else: + raise TypeError(f"Unsupported context resource type: {type(res)}") + + h_desc = create_dev_resource_desc_handle(c_resources, n_resources) + if h_desc.get() == NULL: + HANDLE_RETURN(get_last_error()) + raise RuntimeError("Failed to generate CUDA device resource descriptor") + + h_green = create_green_ctx_handle( + as_cu(h_desc), + self._device_id, + cydriver.CUgreenCtxCreate_flags.CU_GREEN_CTX_DEFAULT_STREAM, + ) + if h_green.get() == NULL: + HANDLE_RETURN(get_last_error()) + raise RuntimeError("Failed to create CUDA green context") + + return Context._from_green_ctx(Context, h_green, self._device_id) + finally: + free(c_resources) def create_stream(self, obj: IsStreamT | None = None, options: StreamOptions | None = None) -> Stream: """Create a Stream object. @@ -1429,6 +1525,7 @@ cdef inline list Device_ensure_tls_devices(cls): device._memory_resource = None device._has_inited = False device._properties = None + device._resources = None device._uuid = None device._context = None devices.append(device) diff --git a/cuda_core/cuda/core/_device_resources.pxd b/cuda_core/cuda/core/_device_resources.pxd new file mode 100644 index 00000000000..0e614562d00 --- /dev/null +++ b/cuda_core/cuda/core/_device_resources.pxd @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + + +cdef class SMResource: + cdef: + cydriver.CUdevResource _resource + bint _is_usable + object __weakref__ + + @staticmethod + cdef SMResource _from_dev_resource(cydriver.CUdevResource res) + + @staticmethod + cdef SMResource _from_dry_run_resource(cydriver.CUdevResource res) + + +cdef class WorkqueueResource: + cdef: + cydriver.CUdevResource _wq_config_resource + cydriver.CUdevResource _wq_resource + object __weakref__ + + @staticmethod + cdef WorkqueueResource _from_dev_resources( + cydriver.CUdevResource wq_config, + cydriver.CUdevResource wq, + ) + + +cdef class DeviceResources: + cdef: + int _device_id + object __weakref__ + + @staticmethod + cdef DeviceResources _init(int device_id) diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx new file mode 100644 index 00000000000..290d01977e2 --- /dev/null +++ b/cuda_core/cuda/core/_device_resources.pyx @@ -0,0 +1,400 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from collections.abc import Sequence as SequenceABC +from dataclasses import dataclass + +from libc.stdint cimport intptr_t +from libc.stdlib cimport free, malloc +from libc.string cimport memset + +from cuda.bindings cimport cydriver +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.version cimport cy_binding_version, cy_driver_version + + +__all__ = [ + "DeviceResources", + "SMResource", + "SMResourceOptions", + "WorkqueueResource", + "WorkqueueResourceOptions", +] + + +cdef inline void _check_green_ctx_support() except *: + cdef tuple drv = cy_driver_version() + cdef tuple bind = cy_binding_version() + if drv < (12, 4, 0): + raise NotImplementedError( + "Green context support requires CUDA driver 12.4 or newer. " + f"Using driver version {'.'.join(map(str, drv))}" + ) + if bind < (12, 4, 0): + raise NotImplementedError( + "Green context support requires cuda.bindings 12.4 or newer. " + f"Using cuda.bindings version {'.'.join(map(str, bind))}" + ) + + +cdef inline void _check_sm_split_support() except *: + cdef tuple drv = cy_driver_version() + cdef tuple bind = cy_binding_version() + if drv < (13, 1, 0): + raise NotImplementedError( + "SMResource.split() requires CUDA driver 13.1 or newer. " + f"Using driver version {'.'.join(map(str, drv))}" + ) + if bind < (13, 1, 0): + raise NotImplementedError( + "SMResource.split() requires cuda.bindings 13.1 or newer. " + f"Using cuda.bindings version {'.'.join(map(str, bind))}" + ) + + +cdef inline void _check_workqueue_support() except *: + cdef tuple drv = cy_driver_version() + cdef tuple bind = cy_binding_version() + if drv < (13, 1, 0): + raise NotImplementedError( + "WorkqueueResource requires CUDA driver 13.1 or newer. " + f"Using driver version {'.'.join(map(str, drv))}" + ) + if bind < (13, 1, 0): + raise NotImplementedError( + "WorkqueueResource requires cuda.bindings 13.1 or newer. " + f"Using cuda.bindings version {'.'.join(map(str, bind))}" + ) + + +@dataclass +class SMResourceOptions: + """Options for :meth:`SMResource.split`. + + ``count`` determines the number of requested groups. Scalar ``count`` or + ``None`` creates one group; a sequence creates ``len(count)`` groups. Other + sequence fields must match the length of ``count``. + """ + + count: int | SequenceABC | None = None + min_count: int | SequenceABC | None = None + coscheduled_sm_count: int | SequenceABC | None = None + preferred_coscheduled_sm_count: int | SequenceABC | None = None + + +@dataclass +class WorkqueueResourceOptions: + """Options for :meth:`WorkqueueResource.configure`.""" + + sharing_scope: str | None = None + + +cdef inline bint _is_sequence(object value): + return ( + isinstance(value, SequenceABC) + and not isinstance(value, (str, bytes, bytearray)) + ) + + +cdef int _resolve_group_count(object options) except -1: + cdef object count = options.count + cdef int n_groups + cdef object value + cdef str field_name + + if count is None or isinstance(count, int): + n_groups = 1 + elif _is_sequence(count): + n_groups = len(count) + if n_groups == 0: + raise ValueError("count sequence must not be empty") + else: + raise TypeError(f"count must be int, Sequence, or None, got {type(count)}") + + if n_groups == 1: + for field_name in ( + "min_count", + "coscheduled_sm_count", + "preferred_coscheduled_sm_count", + ): + value = getattr(options, field_name) + if _is_sequence(value): + raise ValueError( + f"{field_name} is a Sequence but count is scalar; " + "count must be a Sequence to specify multiple groups" + ) + else: + for field_name in ( + "min_count", + "coscheduled_sm_count", + "preferred_coscheduled_sm_count", + ): + value = getattr(options, field_name) + if _is_sequence(value) and len(value) != n_groups: + raise ValueError( + f"{field_name} has length {len(value)}, expected {n_groups} " + "(must match count)" + ) + return n_groups + + +cdef object _broadcast_field(object value, int n_groups): + if _is_sequence(value): + return list(value) + return [value] * n_groups + + +IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef void _fill_group_params( + cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS* params, + int n_groups, + object options, + ) except *: + cdef list counts = _broadcast_field(options.count, n_groups) + cdef list coscheduled = _broadcast_field(options.coscheduled_sm_count, n_groups) + cdef list preferred = _broadcast_field(options.preferred_coscheduled_sm_count, n_groups) + cdef int i + + # v1.0 intentionally does not expose min_count: cuDevSmResourceSplit's + # structured API uses smCount as the per-group requested count. + for i in range(n_groups): + memset(¶ms[i], 0, sizeof(cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS)) + params[i].smCount = 0 if counts[i] is None else counts[i] + if coscheduled[i] is not None: + params[i].coscheduledSmCount = coscheduled[i] + if preferred[i] is not None: + params[i].preferredCoscheduledSmCount = preferred[i] + params[i].flags = 0 + + + cdef object _split_with_general_api(SMResource sm, object options, bint dry_run): + cdef int n_groups = _resolve_group_count(options) + cdef cydriver.CUdevResource* result = NULL + cdef cydriver.CUdevResource remaining + cdef cydriver.CUdevResource synth + cdef cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS* params = NULL + cdef list groups = [] + cdef int i + + params = malloc( + n_groups * sizeof(cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS) + ) + if params == NULL: + raise MemoryError() + + try: + _fill_group_params(params, n_groups, options) + + if not dry_run: + result = malloc( + n_groups * sizeof(cydriver.CUdevResource) + ) + if result == NULL: + raise MemoryError() + + memset(&remaining, 0, sizeof(cydriver.CUdevResource)) + with nogil: + HANDLE_RETURN(cydriver.cuDevSmResourceSplit( + result, + n_groups, + &sm._resource, + &remaining, + 0, + params, + )) + + if result != NULL: + for i in range(n_groups): + groups.append(SMResource._from_dev_resource(result[i])) + return groups, SMResource._from_dev_resource(remaining) + + for i in range(n_groups): + memset(&synth, 0, sizeof(cydriver.CUdevResource)) + synth.type = cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM + synth.sm.smCount = params[i].smCount + groups.append(SMResource._from_dry_run_resource(synth)) + return groups, SMResource._from_dry_run_resource(remaining) + finally: + if params != NULL: + free(params) + if result != NULL: + free(result) +ELSE: + cdef object _split_with_general_api(SMResource sm, object options, bint dry_run): + raise NotImplementedError( + "SMResource.split() requires cuda.core to be built with CUDA 13.x bindings" + ) + + +cdef class SMResource: + """SM resource queried from a device. Not user-constructible.""" + + def __init__(self, *args, **kwargs): + raise RuntimeError( + "SMResource cannot be instantiated directly. " + "Use dev.resources.sm or SMResource.split()." + ) + + @staticmethod + cdef SMResource _from_dev_resource(cydriver.CUdevResource res): + cdef SMResource self = SMResource.__new__(SMResource) + self._resource = res + self._is_usable = True + return self + + @staticmethod + cdef SMResource _from_dry_run_resource(cydriver.CUdevResource res): + cdef SMResource self = SMResource.__new__(SMResource) + self._resource = res + self._is_usable = False + return self + + @property + def handle(self) -> int: + """Return the address of the underlying ``CUdevResource`` struct.""" + return &self._resource + + @property + def sm_count(self) -> int: + """Total SMs available in this resource.""" + return self._resource.sm.smCount + + @property + def min_partition_size(self) -> int: + """Minimum SM count required to create a partition.""" + return self._resource.sm.minSmPartitionSize + + @property + def coscheduled_alignment(self) -> int: + """Number of SMs guaranteed to be co-scheduled.""" + return self._resource.sm.smCoscheduledAlignment + + @property + def flags(self) -> int: + """Raw flags from the underlying SM resource.""" + return self._resource.sm.flags + + def split(self, options not None, *, bint dry_run=False): + """Split this SM resource into groups plus a remainder.""" + if not isinstance(options, SMResourceOptions): + raise TypeError(f"options must be SMResourceOptions, got {type(options)}") + if options.min_count is not None: + raise NotImplementedError( + "SMResourceOptions.min_count is reserved for future use; " + "use count to request SMs in the v1.0 structured split API" + ) + _resolve_group_count(options) + _check_green_ctx_support() + _check_sm_split_support() + return _split_with_general_api(self, options, dry_run) + + +cdef class WorkqueueResource: + """Workqueue resource. Not user-constructible.""" + + def __init__(self, *args, **kwargs): + raise RuntimeError( + "WorkqueueResource cannot be instantiated directly. " + "Use dev.resources.workqueue." + ) + + @staticmethod + cdef WorkqueueResource _from_dev_resources( + cydriver.CUdevResource wq_config, + cydriver.CUdevResource wq, + ): + cdef WorkqueueResource self = WorkqueueResource.__new__(WorkqueueResource) + self._wq_config_resource = wq_config + self._wq_resource = wq + return self + + @property + def handle(self) -> int: + """Return the address of the underlying config ``CUdevResource`` struct.""" + return &self._wq_config_resource + + def configure(self, options not None): + """Configure the workqueue resource in place.""" + _check_green_ctx_support() + _check_workqueue_support() + if not isinstance(options, WorkqueueResourceOptions): + raise TypeError(f"options must be WorkqueueResourceOptions, got {type(options)}") + if options.sharing_scope is None: + return None + + IF CUDA_CORE_BUILD_MAJOR >= 13: + if options.sharing_scope == "device_ctx": + self._wq_config_resource.wqConfig.sharingScope = ( + cydriver.CUdevWorkqueueConfigScope.CU_WORKQUEUE_SCOPE_DEVICE_CTX + ) + elif options.sharing_scope == "green_ctx_balanced": + self._wq_config_resource.wqConfig.sharingScope = ( + cydriver.CUdevWorkqueueConfigScope.CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED + ) + else: + raise ValueError( + f"Unknown sharing_scope: {options.sharing_scope!r}. " + "Expected 'device_ctx' or 'green_ctx_balanced'." + ) + ELSE: + raise NotImplementedError( + "WorkqueueResource requires cuda.core to be built with CUDA 13.x bindings" + ) + + +cdef class DeviceResources: + """Namespace for hardware resource query. Not user-constructible.""" + + def __init__(self, *args, **kwargs): + raise RuntimeError( + "DeviceResources cannot be instantiated directly. " + "Use dev.resources." + ) + + @staticmethod + cdef DeviceResources _init(int device_id): + cdef DeviceResources self = DeviceResources.__new__(DeviceResources) + self._device_id = device_id + return self + + @property + def sm(self) -> SMResource: + """Query SM resources from this device.""" + _check_green_ctx_support() + cdef cydriver.CUdevResource res + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetDevResource( + self._device_id, + &res, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM, + )) + return SMResource._from_dev_resource(res) + + @property + def workqueue(self) -> WorkqueueResource: + """Query workqueue resources from this device.""" + _check_green_ctx_support() + _check_workqueue_support() + cdef cydriver.CUdevResource wq_config + cdef cydriver.CUdevResource wq + + IF CUDA_CORE_BUILD_MAJOR >= 13: + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetDevResource( + self._device_id, + &wq_config, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG, + )) + HANDLE_RETURN(cydriver.cuDeviceGetDevResource( + self._device_id, + &wq, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE, + )) + return WorkqueueResource._from_dev_resources(wq_config, wq) + ELSE: + raise NotImplementedError( + "WorkqueueResource requires cuda.core to be built with CUDA 13.x bindings" + ) diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 9e7307e821b..5378b6a2047 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -20,6 +20,8 @@ from cuda.bindings cimport cynvjitlink cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Handle types ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle + ctypedef shared_ptr[const cydriver.CUgreenCtx] GreenCtxHandle + ctypedef shared_ptr[const cydriver.CUdevResourceDesc] DevResourceDescHandle ctypedef shared_ptr[const cydriver.CUstream] StreamHandle ctypedef shared_ptr[const cydriver.CUevent] EventHandle ctypedef shared_ptr[const cydriver.CUmemoryPool] MemoryPoolHandle @@ -45,6 +47,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # as_cu() - extract the raw CUDA handle (inline C++) cydriver.CUcontext as_cu(ContextHandle h) noexcept nogil + cydriver.CUgreenCtx as_cu(GreenCtxHandle h) noexcept nogil + cydriver.CUdevResourceDesc as_cu(DevResourceDescHandle h) noexcept nogil cydriver.CUstream as_cu(StreamHandle h) noexcept nogil cydriver.CUevent as_cu(EventHandle h) noexcept nogil cydriver.CUmemoryPool as_cu(MemoryPoolHandle h) noexcept nogil @@ -61,6 +65,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # as_intptr() - extract handle as intptr_t for Python interop (inline C++) intptr_t as_intptr(ContextHandle h) noexcept nogil + intptr_t as_intptr(GreenCtxHandle h) noexcept nogil + intptr_t as_intptr(DevResourceDescHandle h) noexcept nogil intptr_t as_intptr(StreamHandle h) noexcept nogil intptr_t as_intptr(EventHandle h) noexcept nogil intptr_t as_intptr(MemoryPoolHandle h) noexcept nogil @@ -78,6 +84,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # as_py() - convert handle to Python wrapper object (inline C++; requires GIL) object as_py(ContextHandle h) + object as_py(GreenCtxHandle h) + object as_py(DevResourceDescHandle h) object as_py(StreamHandle h) object as_py(EventHandle h) object as_py(MemoryPoolHandle h) @@ -107,6 +115,12 @@ cdef void clear_last_error() noexcept nogil # Context handles cdef ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) except+ nogil +cdef GreenCtxHandle create_green_ctx_handle( + cydriver.CUdevResourceDesc desc, cydriver.CUdevice dev, unsigned int flags) except+ nogil +cdef GreenCtxHandle create_green_ctx_handle_ref(cydriver.CUgreenCtx ctx) except+ nogil +cdef ContextHandle get_green_ctx_context(const GreenCtxHandle& h) noexcept nogil +cdef DevResourceDescHandle create_dev_resource_desc_handle( + cydriver.CUdevResource* resources, unsigned int nbResources) except+ nogil cdef ContextHandle get_primary_context(int device_id) except+ nogil cdef ContextHandle get_current_context() except+ nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 2090f5026d0..1a944999fe4 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -20,6 +20,8 @@ from cuda.bindings cimport cynvjitlink from ._resource_handles cimport ( ContextHandle, + GreenCtxHandle, + DevResourceDescHandle, StreamHandle, EventHandle, MemoryPoolHandle, @@ -55,6 +57,14 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Context handles ContextHandle create_context_handle_ref "cuda_core::create_context_handle_ref" ( cydriver.CUcontext ctx) except+ nogil + GreenCtxHandle create_green_ctx_handle "cuda_core::create_green_ctx_handle" ( + cydriver.CUdevResourceDesc desc, cydriver.CUdevice dev, unsigned int flags) except+ nogil + GreenCtxHandle create_green_ctx_handle_ref "cuda_core::create_green_ctx_handle_ref" ( + cydriver.CUgreenCtx ctx) except+ nogil + ContextHandle get_green_ctx_context "cuda_core::get_green_ctx_context" ( + const GreenCtxHandle& h) noexcept nogil + DevResourceDescHandle create_dev_resource_desc_handle "cuda_core::create_dev_resource_desc_handle" ( + cydriver.CUdevResource* resources, unsigned int nbResources) except+ nogil ContextHandle get_primary_context "cuda_core::get_primary_context" ( int device_id) except+ nogil ContextHandle get_current_context "cuda_core::get_current_context" () except+ nogil @@ -223,6 +233,10 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": void* p_cuDevicePrimaryCtxRetain "reinterpret_cast(cuda_core::p_cuDevicePrimaryCtxRetain)" void* p_cuDevicePrimaryCtxRelease "reinterpret_cast(cuda_core::p_cuDevicePrimaryCtxRelease)" void* p_cuCtxGetCurrent "reinterpret_cast(cuda_core::p_cuCtxGetCurrent)" + void* p_cuGreenCtxCreate "reinterpret_cast(cuda_core::p_cuGreenCtxCreate)" + void* p_cuGreenCtxDestroy "reinterpret_cast(cuda_core::p_cuGreenCtxDestroy)" + void* p_cuCtxFromGreenCtx "reinterpret_cast(cuda_core::p_cuCtxFromGreenCtx)" + void* p_cuDevResourceGenerateDesc "reinterpret_cast(cuda_core::p_cuDevResourceGenerateDesc)" # Stream void* p_cuStreamCreateWithPriority "reinterpret_cast(cuda_core::p_cuStreamCreateWithPriority)" @@ -288,10 +302,22 @@ cdef void* _get_driver_fn(str name): capsule = cydriver.__pyx_capi__[name] return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) + +cdef void* _get_optional_driver_fn(str name): + try: + capsule = cydriver.__pyx_capi__[name] + except KeyError: + return NULL + return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) + # Context p_cuDevicePrimaryCtxRetain = _get_driver_fn("cuDevicePrimaryCtxRetain") p_cuDevicePrimaryCtxRelease = _get_driver_fn("cuDevicePrimaryCtxRelease") p_cuCtxGetCurrent = _get_driver_fn("cuCtxGetCurrent") +p_cuGreenCtxCreate = _get_optional_driver_fn("cuGreenCtxCreate") +p_cuGreenCtxDestroy = _get_optional_driver_fn("cuGreenCtxDestroy") +p_cuCtxFromGreenCtx = _get_optional_driver_fn("cuCtxFromGreenCtx") +p_cuDevResourceGenerateDesc = _get_optional_driver_fn("cuDevResourceGenerateDesc") # Stream p_cuStreamCreateWithPriority = _get_driver_fn("cuStreamCreateWithPriority") diff --git a/cuda_core/tests/test_green_context.py b/cuda_core/tests/test_green_context.py new file mode 100644 index 00000000000..7dbdc19d261 --- /dev/null +++ b/cuda_core/tests/test_green_context.py @@ -0,0 +1,123 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from cuda.core import ( + ContextOptions, + DeviceResources, + SMResource, + SMResourceOptions, + WorkqueueResource, + WorkqueueResourceOptions, +) +from cuda.core._utils.cuda_utils import CUDAError + + +def _sm_resource_or_skip(dev): + try: + return dev.resources.sm + except (NotImplementedError, CUDAError) as exc: + pytest.skip(str(exc)) + + +def _split_or_skip(sm, options, **kwargs): + try: + return sm.split(options, **kwargs) + except (NotImplementedError, CUDAError) as exc: + pytest.skip(str(exc)) + + +def _green_context_or_skip(dev): + sm = _sm_resource_or_skip(dev) + groups, _ = _split_or_skip(sm, SMResourceOptions(count=None)) + try: + return dev.create_context(ContextOptions(resources=[groups[0]])) + except CUDAError as exc: + pytest.skip(str(exc)) + + +def test_not_user_constructible(): + with pytest.raises(RuntimeError): + DeviceResources() + with pytest.raises(RuntimeError): + SMResource() + with pytest.raises(RuntimeError): + WorkqueueResource() + + +def test_create_context_without_resources_stays_unimplemented(init_cuda): + with pytest.raises(NotImplementedError): + init_cuda.create_context() + with pytest.raises(NotImplementedError): + init_cuda.create_context(ContextOptions(resources=None)) + with pytest.raises(TypeError): + init_cuda.create_context(object()) + + +def test_sm_resource_query(init_cuda): + sm = _sm_resource_or_skip(init_cuda) + + assert sm.handle != 0 + assert sm.sm_count > 0 + assert sm.min_partition_size > 0 + assert sm.coscheduled_alignment > 0 + assert isinstance(sm.flags, int) + assert not hasattr(sm, "memory_node_id") + + +def test_workqueue_resource_query_and_configure(init_cuda): + try: + wq = init_cuda.resources.workqueue + except (NotImplementedError, CUDAError) as exc: + pytest.skip(str(exc)) + + assert wq.handle != 0 + assert wq.configure(WorkqueueResourceOptions(sharing_scope=None)) is None + assert wq.configure(WorkqueueResourceOptions(sharing_scope="green_ctx_balanced")) is None + with pytest.raises(ValueError, match="Unknown sharing_scope"): + wq.configure(WorkqueueResourceOptions(sharing_scope="bogus")) + + +def test_sm_resource_split_validation(init_cuda): + sm = _sm_resource_or_skip(init_cuda) + + with pytest.raises(ValueError, match="count is scalar"): + sm.split(SMResourceOptions(count=4, coscheduled_sm_count=(2, 2))) + + with pytest.raises(ValueError, match="expected 2"): + sm.split(SMResourceOptions(count=(4, 4), coscheduled_sm_count=(2, 2, 2))) + + with pytest.raises(NotImplementedError, match="min_count"): + sm.split(SMResourceOptions(count=4, min_count=2)) + + +def test_sm_resource_split_dry_run_cannot_create_context(init_cuda): + sm = _sm_resource_or_skip(init_cuda) + groups, _ = _split_or_skip(sm, SMResourceOptions(count=None), dry_run=True) + + assert len(groups) == 1 + with pytest.raises(ValueError, match="dry-run SMResource"): + init_cuda.create_context(ContextOptions(resources=[groups[0]])) + + +def test_create_green_context(init_cuda): + ctx = _green_context_or_skip(init_cuda) + + assert ctx.is_green + assert ctx.handle is not None + ctx.close() + + +def test_set_current_swap_preserves_green_context(init_cuda): + dev = init_cuda + green_ctx = _green_context_or_skip(dev) + + prev = dev.set_current(green_ctx) + assert prev is not None + + restored = dev.set_current(prev) + assert restored is green_ctx + assert restored.is_green + restored.close() From b33c381e1fe11e08febb595dda5dae4462fa40de Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 24 Apr 2026 23:34:25 +0000 Subject: [PATCH 02/11] Refine green context split compatibility --- cuda_core/cuda/core/_context.pyx | 12 ++ cuda_core/cuda/core/_device_resources.pyx | 142 +++++++++++++++++----- cuda_core/tests/test_green_context.py | 41 ++++++- 3 files changed, 161 insertions(+), 34 deletions(-) diff --git a/cuda_core/cuda/core/_context.pyx b/cuda_core/cuda/core/_context.pyx index e9cffc10868..f6facb4284f 100644 --- a/cuda_core/cuda/core/_context.pyx +++ b/cuda_core/cuda/core/_context.pyx @@ -4,13 +4,16 @@ from dataclasses import dataclass +from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport ( ContextHandle, GreenCtxHandle, + as_cu, get_green_ctx_context, as_intptr, as_py, ) +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN __all__ = ['Context', 'ContextOptions'] @@ -63,6 +66,15 @@ cdef class Context: cpdef close(self): """Release this context wrapper's underlying CUDA handles.""" + cdef cydriver.CUcontext current_ctx + if self._h_context.get() != NULL: + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(¤t_ctx)) + if current_ctx == as_cu(self._h_context): + raise RuntimeError( + "Cannot close a CUDA context while it is current. " + "Restore a previous context before closing this context." + ) self._h_context.reset() self._h_green_ctx.reset() diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx index 290d01977e2..1258e95e10c 100644 --- a/cuda_core/cuda/core/_device_resources.pyx +++ b/cuda_core/cuda/core/_device_resources.pyx @@ -40,21 +40,6 @@ cdef inline void _check_green_ctx_support() except *: ) -cdef inline void _check_sm_split_support() except *: - cdef tuple drv = cy_driver_version() - cdef tuple bind = cy_binding_version() - if drv < (13, 1, 0): - raise NotImplementedError( - "SMResource.split() requires CUDA driver 13.1 or newer. " - f"Using driver version {'.'.join(map(str, drv))}" - ) - if bind < (13, 1, 0): - raise NotImplementedError( - "SMResource.split() requires cuda.bindings 13.1 or newer. " - f"Using cuda.bindings version {'.'.join(map(str, bind))}" - ) - - cdef inline void _check_workqueue_support() except *: cdef tuple drv = cy_driver_version() cdef tuple bind = cy_binding_version() @@ -80,7 +65,6 @@ class SMResourceOptions: """ count: int | SequenceABC | None = None - min_count: int | SequenceABC | None = None coscheduled_sm_count: int | SequenceABC | None = None preferred_coscheduled_sm_count: int | SequenceABC | None = None @@ -116,7 +100,6 @@ cdef int _resolve_group_count(object options) except -1: if n_groups == 1: for field_name in ( - "min_count", "coscheduled_sm_count", "preferred_coscheduled_sm_count", ): @@ -128,7 +111,6 @@ cdef int _resolve_group_count(object options) except -1: ) else: for field_name in ( - "min_count", "coscheduled_sm_count", "preferred_coscheduled_sm_count", ): @@ -147,6 +129,71 @@ cdef object _broadcast_field(object value, int n_groups): return [value] * n_groups +cdef inline unsigned int _as_uint(object value, str field_name) except? 0: + if not isinstance(value, int): + raise TypeError(f"{field_name} must be an int or None, got {type(value)}") + if value < 0: + raise ValueError(f"{field_name} must be non-negative") + return value + + +cdef inline unsigned int _count_to_sm_count(object value) except? 0: + if value is None: + return 0 + return _as_uint(value, "count") + + +cdef inline bint _can_use_structured_sm_split(): + IF CUDA_CORE_BUILD_MAJOR >= 13: + return cy_driver_version() >= (13, 1, 0) and cy_binding_version() >= (13, 1, 0) + ELSE: + return False + + +cdef inline void _check_split_by_count_support() except *: + cdef tuple drv = cy_driver_version() + cdef tuple bind = cy_binding_version() + if drv < (12, 4, 0): + raise NotImplementedError( + "SMResource.split() requires CUDA driver 12.4 or newer. " + f"Using driver version {'.'.join(map(str, drv))}" + ) + if bind < (12, 4, 0): + raise NotImplementedError( + "SMResource.split() requires cuda.bindings 12.4 or newer. " + f"Using cuda.bindings version {'.'.join(map(str, bind))}" + ) + + +cdef object _resolve_split_by_count_request(object options): + cdef int n_groups = _resolve_group_count(options) + cdef list counts = _broadcast_field(options.count, n_groups) + cdef object first = counts[0] + cdef object value + cdef unsigned int min_count + + if options.coscheduled_sm_count is not None: + raise NotImplementedError( + "SMResourceOptions.coscheduled_sm_count requires the CUDA 13.1 " + "structured SM split API" + ) + if options.preferred_coscheduled_sm_count is not None: + raise NotImplementedError( + "SMResourceOptions.preferred_coscheduled_sm_count requires the " + "CUDA 13.1 structured SM split API" + ) + + for value in counts[1:]: + if value != first: + raise NotImplementedError( + "CUDA 12 SM splitting only supports homogeneous count values; " + "use CUDA 13.1 or newer for per-group counts" + ) + + min_count = _count_to_sm_count(first) + return n_groups, min_count + + IF CUDA_CORE_BUILD_MAJOR >= 13: cdef void _fill_group_params( cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS* params, @@ -158,15 +205,15 @@ IF CUDA_CORE_BUILD_MAJOR >= 13: cdef list preferred = _broadcast_field(options.preferred_coscheduled_sm_count, n_groups) cdef int i - # v1.0 intentionally does not expose min_count: cuDevSmResourceSplit's - # structured API uses smCount as the per-group requested count. for i in range(n_groups): memset(¶ms[i], 0, sizeof(cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS)) - params[i].smCount = 0 if counts[i] is None else counts[i] + params[i].smCount = _count_to_sm_count(counts[i]) if coscheduled[i] is not None: - params[i].coscheduledSmCount = coscheduled[i] + params[i].coscheduledSmCount = _as_uint(coscheduled[i], "coscheduled_sm_count") if preferred[i] is not None: - params[i].preferredCoscheduledSmCount = preferred[i] + params[i].preferredCoscheduledSmCount = _as_uint( + preferred[i], "preferred_coscheduled_sm_count" + ) params[i].flags = 0 @@ -229,6 +276,44 @@ ELSE: ) +cdef object _split_with_count_api(SMResource sm, object options, bint dry_run): + cdef object request = _resolve_split_by_count_request(options) + cdef unsigned int nb_groups = request[0] + cdef unsigned int min_count = request[1] + cdef unsigned int actual_groups = nb_groups + cdef cydriver.CUdevResource* result = NULL + cdef cydriver.CUdevResource remaining + cdef list groups = [] + cdef int i + + result = malloc(nb_groups * sizeof(cydriver.CUdevResource)) + if result == NULL: + raise MemoryError() + + try: + memset(&remaining, 0, sizeof(cydriver.CUdevResource)) + with nogil: + HANDLE_RETURN(cydriver.cuDevSmResourceSplitByCount( + result, + &actual_groups, + &sm._resource, + &remaining, + 0, + min_count, + )) + + for i in range(actual_groups): + if dry_run: + groups.append(SMResource._from_dry_run_resource(result[i])) + else: + groups.append(SMResource._from_dev_resource(result[i])) + if dry_run: + return groups, SMResource._from_dry_run_resource(remaining) + return groups, SMResource._from_dev_resource(remaining) + finally: + free(result) + + cdef class SMResource: """SM resource queried from a device. Not user-constructible.""" @@ -281,15 +366,12 @@ cdef class SMResource: """Split this SM resource into groups plus a remainder.""" if not isinstance(options, SMResourceOptions): raise TypeError(f"options must be SMResourceOptions, got {type(options)}") - if options.min_count is not None: - raise NotImplementedError( - "SMResourceOptions.min_count is reserved for future use; " - "use count to request SMs in the v1.0 structured split API" - ) _resolve_group_count(options) _check_green_ctx_support() - _check_sm_split_support() - return _split_with_general_api(self, options, dry_run) + if _can_use_structured_sm_split(): + return _split_with_general_api(self, options, dry_run) + _check_split_by_count_support() + return _split_with_count_api(self, options, dry_run) cdef class WorkqueueResource: diff --git a/cuda_core/tests/test_green_context.py b/cuda_core/tests/test_green_context.py index 7dbdc19d261..a287383c404 100644 --- a/cuda_core/tests/test_green_context.py +++ b/cuda_core/tests/test_green_context.py @@ -82,15 +82,16 @@ def test_workqueue_resource_query_and_configure(init_cuda): def test_sm_resource_split_validation(init_cuda): sm = _sm_resource_or_skip(init_cuda) + count = sm.min_partition_size with pytest.raises(ValueError, match="count is scalar"): - sm.split(SMResourceOptions(count=4, coscheduled_sm_count=(2, 2))) + sm.split(SMResourceOptions(count=count, coscheduled_sm_count=(count, count))) with pytest.raises(ValueError, match="expected 2"): - sm.split(SMResourceOptions(count=(4, 4), coscheduled_sm_count=(2, 2, 2))) + sm.split(SMResourceOptions(count=(count, count), coscheduled_sm_count=(count, count, count))) - with pytest.raises(NotImplementedError, match="min_count"): - sm.split(SMResourceOptions(count=4, min_count=2)) + with pytest.raises(ValueError, match="count must be non-negative"): + sm.split(SMResourceOptions(count=-1)) def test_sm_resource_split_dry_run_cannot_create_context(init_cuda): @@ -121,3 +122,35 @@ def test_set_current_swap_preserves_green_context(init_cuda): assert restored is green_ctx assert restored.is_green restored.close() + + +def test_green_context_push_model_creates_stream_and_event(init_cuda): + dev = init_cuda + green_ctx = _green_context_or_skip(dev) + + prev = dev.set_current(green_ctx) + try: + stream = dev.create_stream() + event = stream.record() + stream.sync() + event.sync() + finally: + restored = dev.set_current(prev) + + assert restored is green_ctx + restored.close() + + +def test_close_current_green_context_raises(init_cuda): + dev = init_cuda + green_ctx = _green_context_or_skip(dev) + + prev = dev.set_current(green_ctx) + try: + with pytest.raises(RuntimeError, match="while it is current"): + green_ctx.close() + finally: + restored = dev.set_current(prev) + + assert restored is green_ctx + restored.close() From faf0d1797a9f9ec13a3227fd0d9ae68dd7f0c1ac Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 25 Apr 2026 02:34:06 +0000 Subject: [PATCH 03/11] Encode green context handle dependencies --- cuda_core/cuda/core/_context.pxd | 2 + cuda_core/cuda/core/_context.pyx | 34 +++- cuda_core/cuda/core/_cpp/REGISTRY_DESIGN.md | 3 +- cuda_core/cuda/core/_cpp/resource_handles.cpp | 184 +++++++++++++----- cuda_core/cuda/core/_cpp/resource_handles.hpp | 22 ++- cuda_core/cuda/core/_device.pyx | 4 + cuda_core/cuda/core/_resource_handles.pxd | 5 +- cuda_core/cuda/core/_resource_handles.pyx | 10 +- cuda_core/cuda/core/_stream.pyx | 19 +- cuda_core/tests/test_green_context.py | 4 + 10 files changed, 225 insertions(+), 62 deletions(-) diff --git a/cuda_core/cuda/core/_context.pxd b/cuda_core/cuda/core/_context.pxd index 5d3ed9bba2f..f115a7f139e 100644 --- a/cuda_core/cuda/core/_context.pxd +++ b/cuda_core/cuda/core/_context.pxd @@ -24,4 +24,6 @@ cdef class Context: @staticmethod cdef Context _from_green_ctx(type cls, GreenCtxHandle h_green_ctx, int device_id) + cdef int _ensure_context_handle(self) except -1 + cpdef close(self) diff --git a/cuda_core/cuda/core/_context.pyx b/cuda_core/cuda/core/_context.pyx index f6facb4284f..b404740a204 100644 --- a/cuda_core/cuda/core/_context.pyx +++ b/cuda_core/cuda/core/_context.pyx @@ -9,7 +9,10 @@ from cuda.core._resource_handles cimport ( ContextHandle, GreenCtxHandle, as_cu, - get_green_ctx_context, + create_context_handle_from_green_ctx, + ensure_context_handle, + get_context_green_ctx, + get_last_error, as_intptr, as_py, ) @@ -34,8 +37,9 @@ cdef class Context: """Create Context from existing ContextHandle (cdef-only factory).""" cdef Context ctx = cls.__new__(cls) ctx._h_context = h_context + ctx._h_green_ctx = get_context_green_ctx(h_context) ctx._device_id = device_id - ctx._is_green = False + ctx._is_green = ctx._h_green_ctx.get() != NULL return ctx @staticmethod @@ -43,15 +47,31 @@ cdef class Context: """Create Context from an owning green context handle.""" cdef Context ctx = cls.__new__(cls) ctx._h_green_ctx = h_green_ctx - ctx._h_context = get_green_ctx_context(h_green_ctx) + ctx._h_context = create_context_handle_from_green_ctx(h_green_ctx) ctx._device_id = device_id ctx._is_green = True return ctx + cdef int _ensure_context_handle(self) except -1: + cdef cydriver.CUcontext raw_ctx + if not self._h_context: + return 0 + if as_cu(self._h_context) != NULL: + return 0 + with nogil: + raw_ctx = ensure_context_handle(self._h_context) + if raw_ctx == NULL: + HANDLE_RETURN(get_last_error()) + raise RuntimeError("Failed to materialize CUDA context from green context") + return 0 + @property def handle(self): """Return the underlying CUcontext handle.""" - if self._h_context.get() == NULL: + if not self._h_context: + return None + self._ensure_context_handle() + if as_cu(self._h_context) == NULL: return None return as_py(self._h_context) @@ -67,7 +87,7 @@ cdef class Context: cpdef close(self): """Release this context wrapper's underlying CUDA handles.""" cdef cydriver.CUcontext current_ctx - if self._h_context.get() != NULL: + if self._h_context and as_cu(self._h_context) != NULL: with nogil: HANDLE_RETURN(cydriver.cuCtxGetCurrent(¤t_ctx)) if current_ctx == as_cu(self._h_context): @@ -82,12 +102,16 @@ cdef class Context: if not isinstance(other, Context): return NotImplemented cdef Context _other = other + self._ensure_context_handle() + _other._ensure_context_handle() return as_intptr(self._h_context) == as_intptr(_other._h_context) def __hash__(self) -> int: + self._ensure_context_handle() return hash(as_intptr(self._h_context)) def __repr__(self) -> str: + self._ensure_context_handle() return f"" diff --git a/cuda_core/cuda/core/_cpp/REGISTRY_DESIGN.md b/cuda_core/cuda/core/_cpp/REGISTRY_DESIGN.md index cbfc609686b..089f98acd93 100644 --- a/cuda_core/cuda/core/_cpp/REGISTRY_DESIGN.md +++ b/cuda_core/cuda/core/_cpp/REGISTRY_DESIGN.md @@ -29,7 +29,8 @@ carries timing/IPC flags, `KernelBox` carries the library dependency). Without this level, a round-tripped handle would produce a new Box with default metadata, losing information that was set at creation. -Instances: `event_registry`, `kernel_registry`, `graph_node_registry`. +Instances: `context_registry`, `stream_registry`, `event_registry`, +`kernel_registry`, `graph_node_registry`. ## Level 2: Resource Handle -> Python Object (Cython) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 749ad07847e..30b1bedea71 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #ifndef _WIN32 @@ -226,22 +227,50 @@ void clear_last_error() noexcept { namespace { struct ContextBox { - CUcontext resource; + mutable CUcontext resource; + GreenCtxHandle h_green_ctx; + mutable std::mutex mutex; + + explicit ContextBox(CUcontext resource, GreenCtxHandle h_green_ctx = {}) + : resource(resource), h_green_ctx(std::move(h_green_ctx)) {} }; struct GreenCtxBox { CUgreenCtx resource; - CUcontext context; }; struct DevResourceDescBox { CUdevResourceDesc resource; }; + +static const ContextBox* get_box(const ContextHandle& h) noexcept { + const CUcontext* p = h.get(); + return reinterpret_cast( + reinterpret_cast(p) - offsetof(ContextBox, resource) + ); +} + +// See REGISTRY_DESIGN.md (Level 1: Driver Handle -> Resource Handle) +static HandleRegistry context_registry; } // namespace ContextHandle create_context_handle_ref(CUcontext ctx) { - auto box = std::make_shared(ContextBox{ctx}); - return ContextHandle(box, &box->resource); + if (!ctx) { + return {}; + } + if (auto h = context_registry.lookup(ctx)) { + return h; + } + auto box = std::shared_ptr( + new ContextBox(ctx), + [](const ContextBox* b) { + context_registry.unregister_handle(b->resource); + delete b; + } + ); + ContextHandle h(box, &box->resource); + context_registry.register_handle(ctx, h); + return h; } static const GreenCtxBox* get_box(const GreenCtxHandle& h) noexcept { @@ -251,32 +280,76 @@ static const GreenCtxBox* get_box(const GreenCtxHandle& h) noexcept { ); } +ContextHandle create_context_handle_from_green_ctx(const GreenCtxHandle& h_green_ctx) { + if (!h_green_ctx) { + return {}; + } + auto box = std::shared_ptr( + new ContextBox(nullptr, h_green_ctx), + [](const ContextBox* b) { + if (b->resource) { + context_registry.unregister_handle(b->resource); + } + delete b; + } + ); + return ContextHandle(box, &box->resource); +} + +CUcontext ensure_context_handle(const ContextHandle& h) noexcept { + if (!h) { + err = CUDA_ERROR_INVALID_CONTEXT; + return nullptr; + } + + const ContextBox* box = get_box(h); + std::lock_guard lock(box->mutex); + if (box->resource) { + return box->resource; + } + if (!box->h_green_ctx) { + err = CUDA_ERROR_INVALID_CONTEXT; + return nullptr; + } + if (!p_cuCtxFromGreenCtx) { + err = CUDA_ERROR_NOT_SUPPORTED; + return nullptr; + } + + GILReleaseGuard gil; + CUcontext ctx = nullptr; + if (CUDA_SUCCESS != (err = p_cuCtxFromGreenCtx(&ctx, as_cu(box->h_green_ctx)))) { + return nullptr; + } + box->resource = ctx; + context_registry.register_handle(ctx, h); + return ctx; +} + +GreenCtxHandle get_context_green_ctx(const ContextHandle& h) noexcept { + if (!h) { + return {}; + } + return get_box(h)->h_green_ctx; +} + GreenCtxHandle create_green_ctx_handle(CUdevResourceDesc desc, CUdevice dev, unsigned int flags) { - if (!p_cuGreenCtxCreate || !p_cuCtxFromGreenCtx) { + if (!p_cuGreenCtxCreate || !p_cuGreenCtxDestroy) { err = CUDA_ERROR_NOT_SUPPORTED; return {}; } GILReleaseGuard gil; CUgreenCtx green_ctx = nullptr; - CUcontext ctx = nullptr; if (CUDA_SUCCESS != (err = p_cuGreenCtxCreate(&green_ctx, desc, dev, flags))) { return {}; } - if (CUDA_SUCCESS != (err = p_cuCtxFromGreenCtx(&ctx, green_ctx))) { - if (p_cuGreenCtxDestroy) { - p_cuGreenCtxDestroy(green_ctx); - } - return {}; - } auto box = std::shared_ptr( - new GreenCtxBox{green_ctx, ctx}, + new GreenCtxBox{green_ctx}, [](const GreenCtxBox* b) { GILReleaseGuard gil; - if (p_cuGreenCtxDestroy) { - p_cuGreenCtxDestroy(b->resource); - } + p_cuGreenCtxDestroy(b->resource); delete b; } ); @@ -287,29 +360,10 @@ GreenCtxHandle create_green_ctx_handle_ref(CUgreenCtx green_ctx) { if (!green_ctx) { return {}; } - if (!p_cuCtxFromGreenCtx) { - err = CUDA_ERROR_NOT_SUPPORTED; - return {}; - } - - GILReleaseGuard gil; - CUcontext ctx = nullptr; - if (CUDA_SUCCESS != (err = p_cuCtxFromGreenCtx(&ctx, green_ctx))) { - return {}; - } - - auto box = std::make_shared(GreenCtxBox{green_ctx, ctx}); + auto box = std::make_shared(GreenCtxBox{green_ctx}); return GreenCtxHandle(box, &box->resource); } -ContextHandle get_green_ctx_context(const GreenCtxHandle& h) noexcept { - if (!h) { - return {}; - } - const GreenCtxBox* box = get_box(h); - return ContextHandle(h, &box->context); -} - DevResourceDescHandle create_dev_resource_desc_handle(CUdevResource* resources, unsigned int nbResources) { if (!p_cuDevResourceGenerateDesc) { err = CUDA_ERROR_NOT_SUPPORTED; @@ -345,14 +399,16 @@ ContextHandle get_primary_context(int device_id) { } auto box = std::shared_ptr( - new ContextBox{ctx}, + new ContextBox(ctx), [device_id](const ContextBox* b) { + context_registry.unregister_handle(b->resource); GILReleaseGuard gil; p_cuDevicePrimaryCtxRelease(device_id); delete b; } ); auto h = ContextHandle(box, &box->resource); + context_registry.register_handle(ctx, h); // Update cache if (static_cast(device_id) >= primary_context_cache.size()) { @@ -381,7 +437,18 @@ ContextHandle get_current_context() { namespace { struct StreamBox { CUstream resource; + ContextHandle h_context; }; + +static const StreamBox* get_box(const StreamHandle& h) noexcept { + const CUstream* p = h.get(); + return reinterpret_cast( + reinterpret_cast(p) - offsetof(StreamBox, resource) + ); +} + +// See REGISTRY_DESIGN.md (Level 1: Driver Handle -> Resource Handle) +static HandleRegistry stream_registry; } // namespace StreamHandle create_stream_handle(const ContextHandle& h_ctx, unsigned int flags, int priority) { @@ -392,22 +459,44 @@ StreamHandle create_stream_handle(const ContextHandle& h_ctx, unsigned int flags } auto box = std::shared_ptr( - new StreamBox{stream}, - [h_ctx](const StreamBox* b) { + new StreamBox{stream, h_ctx}, + [](const StreamBox* b) { + stream_registry.unregister_handle(b->resource); GILReleaseGuard gil; p_cuStreamDestroy(b->resource); delete b; } ); - return StreamHandle(box, &box->resource); + StreamHandle h(box, &box->resource); + stream_registry.register_handle(stream, h); + return h; } StreamHandle create_stream_handle_ref(CUstream stream) { - auto box = std::make_shared(StreamBox{stream}); - return StreamHandle(box, &box->resource); + if (auto h = stream_registry.lookup(stream)) { + return h; + } + auto box = std::shared_ptr( + new StreamBox{stream, {}}, + [](const StreamBox* b) { + stream_registry.unregister_handle(b->resource); + delete b; + } + ); + StreamHandle h(box, &box->resource); + stream_registry.register_handle(stream, h); + return h; } StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) { + if (auto h = stream_registry.lookup(stream)) { + // Reuse handles that already carry structural context metadata, e.g. + // cuda-core-owned streams. Owner-backed foreign streams still need a + // fresh handle so the supplied owner is retained. + if (get_box(h)->h_context) { + return h; + } + } if (!owner) { return create_stream_handle_ref(stream); } @@ -419,8 +508,9 @@ StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) { } Py_INCREF(owner); auto box = std::shared_ptr( - new StreamBox{stream}, + new StreamBox{stream, {}}, [owner](const StreamBox* b) { + stream_registry.unregister_handle(b->resource); GILAcquireGuard gil; if (gil.acquired()) { Py_DECREF(owner); @@ -428,7 +518,13 @@ StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) { delete b; } ); - return StreamHandle(box, &box->resource); + StreamHandle h(box, &box->resource); + stream_registry.register_handle(stream, h); + return h; +} + +ContextHandle get_stream_context(const StreamHandle& h) noexcept { + return h ? get_box(h)->h_context : ContextHandle{}; } StreamHandle get_legacy_stream() { diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index c312d05f570..90b40706ece 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -170,17 +170,24 @@ using FileDescriptorHandle = std::shared_ptr; // Function to create a non-owning context handle (references existing context). ContextHandle create_context_handle_ref(CUcontext ctx); -// Create an owning green context handle. The handle keeps the paired CUcontext -// returned by cuCtxFromGreenCtx in the same control block. +// Create a context handle whose CUcontext view is lazily materialized from +// the provided green context. The returned ContextHandle keeps the green +// context alive. +ContextHandle create_context_handle_from_green_ctx(const GreenCtxHandle& h_green_ctx); + +// Ensure a ContextHandle has a materialized CUcontext value. For green-context +// views this calls cuCtxFromGreenCtx once and caches the non-owning CUcontext. +CUcontext ensure_context_handle(const ContextHandle& h) noexcept; + +// Return the green context dependency associated with a ContextHandle, if any. +GreenCtxHandle get_context_green_ctx(const ContextHandle& h) noexcept; + +// Create an owning green context handle. GreenCtxHandle create_green_ctx_handle(CUdevResourceDesc desc, CUdevice dev, unsigned int flags); // Create a non-owning green context handle. GreenCtxHandle create_green_ctx_handle_ref(CUgreenCtx ctx); -// Get the CUcontext paired with a green context handle. The returned handle -// shares ownership with the green context. -ContextHandle get_green_ctx_context(const GreenCtxHandle& h) noexcept; - // Generate a descriptor for a resource list. CUDA exposes no explicit destroy // API for CUdevResourceDesc; this handle only carries the opaque value. DevResourceDescHandle create_dev_resource_desc_handle(CUdevResource* resources, unsigned int nbResources); @@ -214,6 +221,9 @@ StreamHandle create_stream_handle_ref(CUstream stream); // The owner is responsible for keeping the stream's context alive. StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner); +// Return the context dependency associated with a stream handle, if any. +ContextHandle get_stream_context(const StreamHandle& h) noexcept; + // Get non-owning handle to the legacy default stream (CU_STREAM_LEGACY) // Note: Legacy stream has no specific context dependency. StreamHandle get_legacy_stream(); diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index 0abe80b7e01..87219c4a187 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -1262,6 +1262,7 @@ class Device: if self._has_inited and self._context is not None: prev_owned = self._context # prev_ctx is the previous context + ctx._ensure_context_handle() curr_ctx = as_cu(ctx._h_context) prev_ctx = NULL with nogil: @@ -1270,6 +1271,8 @@ class Device: self._has_inited = True self._context = ctx # Store owning context reference if prev_ctx != NULL: + if prev_owned is not None: + prev_owned._ensure_context_handle() if prev_owned is not None and as_cu(prev_owned._h_context) == prev_ctx: return prev_owned return Context._from_handle(Context, create_context_handle_ref(prev_ctx), self._device_id) @@ -1415,6 +1418,7 @@ class Device: """ self._check_context_initialized() cdef Context ctx = self._context + ctx._ensure_context_handle() return cyEvent._init(cyEvent, self._device_id, ctx._h_context, options, True) def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer: diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 5378b6a2047..40816fe8135 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -115,10 +115,12 @@ cdef void clear_last_error() noexcept nogil # Context handles cdef ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) except+ nogil +cdef ContextHandle create_context_handle_from_green_ctx(const GreenCtxHandle& h_green_ctx) except+ nogil +cdef cydriver.CUcontext ensure_context_handle(const ContextHandle& h) noexcept nogil +cdef GreenCtxHandle get_context_green_ctx(const ContextHandle& h) noexcept nogil cdef GreenCtxHandle create_green_ctx_handle( cydriver.CUdevResourceDesc desc, cydriver.CUdevice dev, unsigned int flags) except+ nogil cdef GreenCtxHandle create_green_ctx_handle_ref(cydriver.CUgreenCtx ctx) except+ nogil -cdef ContextHandle get_green_ctx_context(const GreenCtxHandle& h) noexcept nogil cdef DevResourceDescHandle create_dev_resource_desc_handle( cydriver.CUdevResource* resources, unsigned int nbResources) except+ nogil cdef ContextHandle get_primary_context(int device_id) except+ nogil @@ -129,6 +131,7 @@ cdef StreamHandle create_stream_handle( const ContextHandle& h_ctx, unsigned int flags, int priority) except+ nogil cdef StreamHandle create_stream_handle_ref(cydriver.CUstream stream) except+ nogil cdef StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner) except+ nogil +cdef ContextHandle get_stream_context(const StreamHandle& h) noexcept nogil cdef StreamHandle get_legacy_stream() except+ nogil cdef StreamHandle get_per_thread_stream() except+ nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 1a944999fe4..dd4f204e20d 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -57,12 +57,16 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Context handles ContextHandle create_context_handle_ref "cuda_core::create_context_handle_ref" ( cydriver.CUcontext ctx) except+ nogil + ContextHandle create_context_handle_from_green_ctx "cuda_core::create_context_handle_from_green_ctx" ( + const GreenCtxHandle& h_green_ctx) except+ nogil + cydriver.CUcontext ensure_context_handle "cuda_core::ensure_context_handle" ( + const ContextHandle& h) noexcept nogil + GreenCtxHandle get_context_green_ctx "cuda_core::get_context_green_ctx" ( + const ContextHandle& h) noexcept nogil GreenCtxHandle create_green_ctx_handle "cuda_core::create_green_ctx_handle" ( cydriver.CUdevResourceDesc desc, cydriver.CUdevice dev, unsigned int flags) except+ nogil GreenCtxHandle create_green_ctx_handle_ref "cuda_core::create_green_ctx_handle_ref" ( cydriver.CUgreenCtx ctx) except+ nogil - ContextHandle get_green_ctx_context "cuda_core::get_green_ctx_context" ( - const GreenCtxHandle& h) noexcept nogil DevResourceDescHandle create_dev_resource_desc_handle "cuda_core::create_dev_resource_desc_handle" ( cydriver.CUdevResource* resources, unsigned int nbResources) except+ nogil ContextHandle get_primary_context "cuda_core::get_primary_context" ( @@ -76,6 +80,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUstream stream) except+ nogil StreamHandle create_stream_handle_with_owner "cuda_core::create_stream_handle_with_owner" ( cydriver.CUstream stream, object owner) except+ nogil + ContextHandle get_stream_context "cuda_core::get_stream_context" ( + const StreamHandle& h) noexcept nogil StreamHandle get_legacy_stream "cuda_core::get_legacy_stream" () except+ nogil StreamHandle get_per_thread_stream "cuda_core::get_per_thread_stream" () except+ nogil diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index ca13811cd3c..b4b1f2d0c28 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -30,9 +30,12 @@ from cuda.core._resource_handles cimport ( create_event_handle_noctx, create_stream_handle, create_stream_handle_with_owner, + ensure_context_handle, get_current_context, + get_last_error, get_legacy_stream, get_per_thread_stream, + get_stream_context, as_intptr, as_cu, as_py, @@ -96,7 +99,7 @@ cdef class Stream: """Create a Stream from an existing StreamHandle (cdef-only factory).""" cdef Stream s = cls.__new__(cls) s._h_stream = h_stream - # _h_context is default-initialized to empty ContextHandle by C++ + s._h_context = get_stream_context(h_stream) s._device_id = -1 # lazy init'd (invalid sentinel) s._nonblocking = -1 # lazy init'd s._priority = INT32_MIN # lazy init'd @@ -406,7 +409,15 @@ cdef inline int Stream_ensure_ctx(Stream self) except?-1 nogil: """Ensure the stream's context handle is populated.""" cdef cydriver.CUcontext ctx if not self._h_context: - HANDLE_RETURN(cydriver.cuStreamGetCtx(as_cu(self._h_stream), &ctx)) + self._h_context = get_stream_context(self._h_stream) + if self._h_context: + if as_cu(self._h_context) == NULL: + ctx = ensure_context_handle(self._h_context) + if ctx == NULL: + HANDLE_RETURN(get_last_error()) + return 0 + HANDLE_RETURN(cydriver.cuStreamGetCtx(as_cu(self._h_stream), &ctx)) + if ctx != NULL: with gil: self._h_context = create_context_handle_ref(ctx) return 0 @@ -416,13 +427,15 @@ cdef inline int Stream_ensure_ctx_device(Stream self) except?-1: """Ensure the stream's context and device_id are populated.""" cdef cydriver.CUcontext ctx cdef cydriver.CUdevice target_dev + cdef ContextHandle current_context cdef bint switch_context if self._device_id < 0: with nogil: # Get device ID from context, switching context temporarily if needed Stream_ensure_ctx(self) - switch_context = (get_current_context() != self._h_context) + current_context = get_current_context() + switch_context = (as_cu(current_context) != as_cu(self._h_context)) if switch_context: HANDLE_RETURN(cydriver.cuCtxPushCurrent(as_cu(self._h_context))) HANDLE_RETURN(cydriver.cuCtxGetDevice(&target_dev)) diff --git a/cuda_core/tests/test_green_context.py b/cuda_core/tests/test_green_context.py index a287383c404..d92a5089e8b 100644 --- a/cuda_core/tests/test_green_context.py +++ b/cuda_core/tests/test_green_context.py @@ -132,6 +132,10 @@ def test_green_context_push_model_creates_stream_and_event(init_cuda): try: stream = dev.create_stream() event = stream.record() + assert stream.context.is_green + assert stream.context == green_ctx + assert event.context.is_green + assert event.context == green_ctx stream.sync() event.sync() finally: From 7c35ef345a038a8490e64e009886a37b14aef8cc Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 25 Apr 2026 03:07:55 +0000 Subject: [PATCH 04/11] Simplify green context view handles --- cuda_core/cuda/core/_context.pxd | 2 - cuda_core/cuda/core/_context.pyx | 22 +------ cuda_core/cuda/core/_cpp/resource_handles.cpp | 63 +++++-------------- cuda_core/cuda/core/_cpp/resource_handles.hpp | 10 +-- cuda_core/cuda/core/_device.pyx | 4 -- cuda_core/cuda/core/_resource_handles.pxd | 1 - cuda_core/cuda/core/_resource_handles.pyx | 2 - cuda_core/cuda/core/_stream.pyx | 6 -- 8 files changed, 22 insertions(+), 88 deletions(-) diff --git a/cuda_core/cuda/core/_context.pxd b/cuda_core/cuda/core/_context.pxd index f115a7f139e..5d3ed9bba2f 100644 --- a/cuda_core/cuda/core/_context.pxd +++ b/cuda_core/cuda/core/_context.pxd @@ -24,6 +24,4 @@ cdef class Context: @staticmethod cdef Context _from_green_ctx(type cls, GreenCtxHandle h_green_ctx, int device_id) - cdef int _ensure_context_handle(self) except -1 - cpdef close(self) diff --git a/cuda_core/cuda/core/_context.pyx b/cuda_core/cuda/core/_context.pyx index b404740a204..727351fc809 100644 --- a/cuda_core/cuda/core/_context.pyx +++ b/cuda_core/cuda/core/_context.pyx @@ -10,7 +10,6 @@ from cuda.core._resource_handles cimport ( GreenCtxHandle, as_cu, create_context_handle_from_green_ctx, - ensure_context_handle, get_context_green_ctx, get_last_error, as_intptr, @@ -48,29 +47,18 @@ cdef class Context: cdef Context ctx = cls.__new__(cls) ctx._h_green_ctx = h_green_ctx ctx._h_context = create_context_handle_from_green_ctx(h_green_ctx) + if not ctx._h_context: + HANDLE_RETURN(get_last_error()) + raise RuntimeError("Failed to create CUDA context view from green context") ctx._device_id = device_id ctx._is_green = True return ctx - cdef int _ensure_context_handle(self) except -1: - cdef cydriver.CUcontext raw_ctx - if not self._h_context: - return 0 - if as_cu(self._h_context) != NULL: - return 0 - with nogil: - raw_ctx = ensure_context_handle(self._h_context) - if raw_ctx == NULL: - HANDLE_RETURN(get_last_error()) - raise RuntimeError("Failed to materialize CUDA context from green context") - return 0 - @property def handle(self): """Return the underlying CUcontext handle.""" if not self._h_context: return None - self._ensure_context_handle() if as_cu(self._h_context) == NULL: return None return as_py(self._h_context) @@ -102,16 +90,12 @@ cdef class Context: if not isinstance(other, Context): return NotImplemented cdef Context _other = other - self._ensure_context_handle() - _other._ensure_context_handle() return as_intptr(self._h_context) == as_intptr(_other._h_context) def __hash__(self) -> int: - self._ensure_context_handle() return hash(as_intptr(self._h_context)) def __repr__(self) -> str: - self._ensure_context_handle() return f"" diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 30b1bedea71..92801f7759e 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #ifndef _WIN32 @@ -227,12 +226,8 @@ void clear_last_error() noexcept { namespace { struct ContextBox { - mutable CUcontext resource; + CUcontext resource; GreenCtxHandle h_green_ctx; - mutable std::mutex mutex; - - explicit ContextBox(CUcontext resource, GreenCtxHandle h_green_ctx = {}) - : resource(resource), h_green_ctx(std::move(h_green_ctx)) {} }; struct GreenCtxBox { @@ -262,7 +257,7 @@ ContextHandle create_context_handle_ref(CUcontext ctx) { return h; } auto box = std::shared_ptr( - new ContextBox(ctx), + new ContextBox{ctx, {}}, [](const ContextBox* b) { context_registry.unregister_handle(b->resource); delete b; @@ -273,57 +268,31 @@ ContextHandle create_context_handle_ref(CUcontext ctx) { return h; } -static const GreenCtxBox* get_box(const GreenCtxHandle& h) noexcept { - const CUgreenCtx* p = h.get(); - return reinterpret_cast( - reinterpret_cast(p) - offsetof(GreenCtxBox, resource) - ); -} - ContextHandle create_context_handle_from_green_ctx(const GreenCtxHandle& h_green_ctx) { if (!h_green_ctx) { return {}; } - auto box = std::shared_ptr( - new ContextBox(nullptr, h_green_ctx), - [](const ContextBox* b) { - if (b->resource) { - context_registry.unregister_handle(b->resource); - } - delete b; - } - ); - return ContextHandle(box, &box->resource); -} - -CUcontext ensure_context_handle(const ContextHandle& h) noexcept { - if (!h) { - err = CUDA_ERROR_INVALID_CONTEXT; - return nullptr; - } - - const ContextBox* box = get_box(h); - std::lock_guard lock(box->mutex); - if (box->resource) { - return box->resource; - } - if (!box->h_green_ctx) { - err = CUDA_ERROR_INVALID_CONTEXT; - return nullptr; - } if (!p_cuCtxFromGreenCtx) { err = CUDA_ERROR_NOT_SUPPORTED; - return nullptr; + return {}; } GILReleaseGuard gil; CUcontext ctx = nullptr; - if (CUDA_SUCCESS != (err = p_cuCtxFromGreenCtx(&ctx, as_cu(box->h_green_ctx)))) { - return nullptr; + if (CUDA_SUCCESS != (err = p_cuCtxFromGreenCtx(&ctx, as_cu(h_green_ctx)))) { + return {}; } - box->resource = ctx; + + auto box = std::shared_ptr( + new ContextBox{ctx, h_green_ctx}, + [](const ContextBox* b) { + context_registry.unregister_handle(b->resource); + delete b; + } + ); + ContextHandle h(box, &box->resource); context_registry.register_handle(ctx, h); - return ctx; + return h; } GreenCtxHandle get_context_green_ctx(const ContextHandle& h) noexcept { @@ -399,7 +368,7 @@ ContextHandle get_primary_context(int device_id) { } auto box = std::shared_ptr( - new ContextBox(ctx), + new ContextBox{ctx, {}}, [device_id](const ContextBox* b) { context_registry.unregister_handle(b->resource); GILReleaseGuard gil; diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 90b40706ece..277afcb9088 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -170,15 +170,11 @@ using FileDescriptorHandle = std::shared_ptr; // Function to create a non-owning context handle (references existing context). ContextHandle create_context_handle_ref(CUcontext ctx); -// Create a context handle whose CUcontext view is lazily materialized from -// the provided green context. The returned ContextHandle keeps the green -// context alive. +// Create a context handle for the CUcontext view of the provided green context. +// The returned ContextHandle keeps the green context alive, but the CUcontext +// view is non-owning and is not destroyed independently. ContextHandle create_context_handle_from_green_ctx(const GreenCtxHandle& h_green_ctx); -// Ensure a ContextHandle has a materialized CUcontext value. For green-context -// views this calls cuCtxFromGreenCtx once and caches the non-owning CUcontext. -CUcontext ensure_context_handle(const ContextHandle& h) noexcept; - // Return the green context dependency associated with a ContextHandle, if any. GreenCtxHandle get_context_green_ctx(const ContextHandle& h) noexcept; diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index 87219c4a187..0abe80b7e01 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -1262,7 +1262,6 @@ class Device: if self._has_inited and self._context is not None: prev_owned = self._context # prev_ctx is the previous context - ctx._ensure_context_handle() curr_ctx = as_cu(ctx._h_context) prev_ctx = NULL with nogil: @@ -1271,8 +1270,6 @@ class Device: self._has_inited = True self._context = ctx # Store owning context reference if prev_ctx != NULL: - if prev_owned is not None: - prev_owned._ensure_context_handle() if prev_owned is not None and as_cu(prev_owned._h_context) == prev_ctx: return prev_owned return Context._from_handle(Context, create_context_handle_ref(prev_ctx), self._device_id) @@ -1418,7 +1415,6 @@ class Device: """ self._check_context_initialized() cdef Context ctx = self._context - ctx._ensure_context_handle() return cyEvent._init(cyEvent, self._device_id, ctx._h_context, options, True) def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer: diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 40816fe8135..f1298befc95 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -116,7 +116,6 @@ cdef void clear_last_error() noexcept nogil # Context handles cdef ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) except+ nogil cdef ContextHandle create_context_handle_from_green_ctx(const GreenCtxHandle& h_green_ctx) except+ nogil -cdef cydriver.CUcontext ensure_context_handle(const ContextHandle& h) noexcept nogil cdef GreenCtxHandle get_context_green_ctx(const ContextHandle& h) noexcept nogil cdef GreenCtxHandle create_green_ctx_handle( cydriver.CUdevResourceDesc desc, cydriver.CUdevice dev, unsigned int flags) except+ nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index dd4f204e20d..329bbaef24f 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -59,8 +59,6 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUcontext ctx) except+ nogil ContextHandle create_context_handle_from_green_ctx "cuda_core::create_context_handle_from_green_ctx" ( const GreenCtxHandle& h_green_ctx) except+ nogil - cydriver.CUcontext ensure_context_handle "cuda_core::ensure_context_handle" ( - const ContextHandle& h) noexcept nogil GreenCtxHandle get_context_green_ctx "cuda_core::get_context_green_ctx" ( const ContextHandle& h) noexcept nogil GreenCtxHandle create_green_ctx_handle "cuda_core::create_green_ctx_handle" ( diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index b4b1f2d0c28..0c1ab6f4b2c 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -30,9 +30,7 @@ from cuda.core._resource_handles cimport ( create_event_handle_noctx, create_stream_handle, create_stream_handle_with_owner, - ensure_context_handle, get_current_context, - get_last_error, get_legacy_stream, get_per_thread_stream, get_stream_context, @@ -411,10 +409,6 @@ cdef inline int Stream_ensure_ctx(Stream self) except?-1 nogil: if not self._h_context: self._h_context = get_stream_context(self._h_stream) if self._h_context: - if as_cu(self._h_context) == NULL: - ctx = ensure_context_handle(self._h_context) - if ctx == NULL: - HANDLE_RETURN(get_last_error()) return 0 HANDLE_RETURN(cydriver.cuStreamGetCtx(as_cu(self._h_stream), &ctx)) if ctx != NULL: From da58b7d5e7d72f1e6bfaf90e500b455af29d9935 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 25 Apr 2026 03:34:57 +0000 Subject: [PATCH 05/11] Simplify green context descriptor handling --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 30 +++++-------------- cuda_core/cuda/core/_cpp/resource_handles.hpp | 22 ++------------ cuda_core/cuda/core/_device.pyx | 13 ++------ cuda_core/cuda/core/_device_resources.pyx | 10 +++---- cuda_core/cuda/core/_resource_handles.pxd | 9 ++---- cuda_core/cuda/core/_resource_handles.pyx | 6 ++-- 6 files changed, 22 insertions(+), 68 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 92801f7759e..3119f546b55 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -234,10 +234,6 @@ struct GreenCtxBox { CUgreenCtx resource; }; -struct DevResourceDescBox { - CUdevResourceDesc resource; -}; - static const ContextBox* get_box(const ContextHandle& h) noexcept { const CUcontext* p = h.get(); return reinterpret_cast( @@ -302,13 +298,19 @@ GreenCtxHandle get_context_green_ctx(const ContextHandle& h) noexcept { return get_box(h)->h_green_ctx; } -GreenCtxHandle create_green_ctx_handle(CUdevResourceDesc desc, CUdevice dev, unsigned int flags) { - if (!p_cuGreenCtxCreate || !p_cuGreenCtxDestroy) { +GreenCtxHandle create_green_ctx_handle(CUdevResource* resources, unsigned int nbResources, + CUdevice dev, unsigned int flags) { + if (!p_cuDevResourceGenerateDesc || !p_cuGreenCtxCreate || !p_cuGreenCtxDestroy) { err = CUDA_ERROR_NOT_SUPPORTED; return {}; } GILReleaseGuard gil; + CUdevResourceDesc desc = nullptr; + if (CUDA_SUCCESS != (err = p_cuDevResourceGenerateDesc(&desc, resources, nbResources))) { + return {}; + } + CUgreenCtx green_ctx = nullptr; if (CUDA_SUCCESS != (err = p_cuGreenCtxCreate(&green_ctx, desc, dev, flags))) { return {}; @@ -333,22 +335,6 @@ GreenCtxHandle create_green_ctx_handle_ref(CUgreenCtx green_ctx) { return GreenCtxHandle(box, &box->resource); } -DevResourceDescHandle create_dev_resource_desc_handle(CUdevResource* resources, unsigned int nbResources) { - if (!p_cuDevResourceGenerateDesc) { - err = CUDA_ERROR_NOT_SUPPORTED; - return {}; - } - - GILReleaseGuard gil; - CUdevResourceDesc desc = nullptr; - if (CUDA_SUCCESS != (err = p_cuDevResourceGenerateDesc(&desc, resources, nbResources))) { - return {}; - } - - auto box = std::make_shared(DevResourceDescBox{desc}); - return DevResourceDescHandle(box, &box->resource); -} - // Thread-local cache of primary contexts indexed by device ID static thread_local std::vector primary_context_cache; diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 277afcb9088..0f91113e4e4 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -147,7 +147,6 @@ extern NvJitLinkDestroyFn p_nvJitLinkDestroy; using ContextHandle = std::shared_ptr; using GreenCtxHandle = std::shared_ptr; -using DevResourceDescHandle = std::shared_ptr; using StreamHandle = std::shared_ptr; using EventHandle = std::shared_ptr; using MemoryPoolHandle = std::shared_ptr; @@ -178,16 +177,13 @@ ContextHandle create_context_handle_from_green_ctx(const GreenCtxHandle& h_green // Return the green context dependency associated with a ContextHandle, if any. GreenCtxHandle get_context_green_ctx(const ContextHandle& h) noexcept; -// Create an owning green context handle. -GreenCtxHandle create_green_ctx_handle(CUdevResourceDesc desc, CUdevice dev, unsigned int flags); +// Create an owning green context handle from a list of device resources. +GreenCtxHandle create_green_ctx_handle(CUdevResource* resources, unsigned int nbResources, + CUdevice dev, unsigned int flags); // Create a non-owning green context handle. GreenCtxHandle create_green_ctx_handle_ref(CUgreenCtx ctx); -// Generate a descriptor for a resource list. CUDA exposes no explicit destroy -// API for CUdevResourceDesc; this handle only carries the opaque value. -DevResourceDescHandle create_dev_resource_desc_handle(CUdevResource* resources, unsigned int nbResources); - // Get handle to the primary context for a device (with thread-local caching) // Returns empty handle on error (caller must check) ContextHandle get_primary_context(int device_id); @@ -532,10 +528,6 @@ inline CUgreenCtx as_cu(const GreenCtxHandle& h) noexcept { return h ? *h : nullptr; } -inline CUdevResourceDesc as_cu(const DevResourceDescHandle& h) noexcept { - return h ? *h : nullptr; -} - inline CUstream as_cu(const StreamHandle& h) noexcept { return h ? *h : nullptr; } @@ -598,10 +590,6 @@ inline std::intptr_t as_intptr(const GreenCtxHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } -inline std::intptr_t as_intptr(const DevResourceDescHandle& h) noexcept { - return reinterpret_cast(as_cu(h)); -} - inline std::intptr_t as_intptr(const StreamHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } @@ -696,10 +684,6 @@ inline PyObject* as_py(const GreenCtxHandle& h) noexcept { return detail::make_py("cuda.bindings.driver", "CUgreenCtx", as_intptr(h)); } -inline PyObject* as_py(const DevResourceDescHandle& h) noexcept { - return detail::make_py("cuda.bindings.driver", "CUdevResourceDesc", as_intptr(h)); -} - inline PyObject* as_py(const StreamHandle& h) noexcept { return detail::make_py("cuda.bindings.driver", "CUstream", as_intptr(h)); } diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index 0abe80b7e01..df1f95f158b 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -18,19 +18,15 @@ from cuda.core._device_resources cimport DeviceResources, SMResource, WorkqueueR from cuda.core._device_resources import ( DeviceResources, SMResource, - SMResourceOptions, WorkqueueResource, - WorkqueueResourceOptions, ) from cuda.core._event cimport Event as cyEvent from cuda.core._event import Event, EventOptions from cuda.core._memory._buffer cimport Buffer, MemoryResource from cuda.core._resource_handles cimport ( ContextHandle, - DevResourceDescHandle, GreenCtxHandle, create_context_handle_ref, - create_dev_resource_desc_handle, create_green_ctx_handle, get_primary_context, get_last_error, @@ -1308,7 +1304,6 @@ class Device: cdef SMResource sm_res cdef WorkqueueResource wq_res cdef cydriver.CUdevResource* c_resources = NULL - cdef DevResourceDescHandle h_desc cdef GreenCtxHandle h_green if options is None: @@ -1346,13 +1341,9 @@ class Device: else: raise TypeError(f"Unsupported context resource type: {type(res)}") - h_desc = create_dev_resource_desc_handle(c_resources, n_resources) - if h_desc.get() == NULL: - HANDLE_RETURN(get_last_error()) - raise RuntimeError("Failed to generate CUDA device resource descriptor") - h_green = create_green_ctx_handle( - as_cu(h_desc), + c_resources, + n_resources, self._device_id, cydriver.CUgreenCtxCreate_flags.CU_GREEN_CTX_DEFAULT_STREAM, ) diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx index 1258e95e10c..2db864b039e 100644 --- a/cuda_core/cuda/core/_device_resources.pyx +++ b/cuda_core/cuda/core/_device_resources.pyx @@ -460,22 +460,22 @@ cdef class DeviceResources: """Query workqueue resources from this device.""" _check_green_ctx_support() _check_workqueue_support() - cdef cydriver.CUdevResource wq_config - cdef cydriver.CUdevResource wq + cdef cydriver.CUdevResource _wq_config + cdef cydriver.CUdevResource _wq IF CUDA_CORE_BUILD_MAJOR >= 13: with nogil: HANDLE_RETURN(cydriver.cuDeviceGetDevResource( self._device_id, - &wq_config, + &_wq_config, cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG, )) HANDLE_RETURN(cydriver.cuDeviceGetDevResource( self._device_id, - &wq, + &_wq, cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE, )) - return WorkqueueResource._from_dev_resources(wq_config, wq) + return WorkqueueResource._from_dev_resources(_wq_config, _wq) ELSE: raise NotImplementedError( "WorkqueueResource requires cuda.core to be built with CUDA 13.x bindings" diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index f1298befc95..ade94beb94b 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -21,7 +21,6 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Handle types ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle ctypedef shared_ptr[const cydriver.CUgreenCtx] GreenCtxHandle - ctypedef shared_ptr[const cydriver.CUdevResourceDesc] DevResourceDescHandle ctypedef shared_ptr[const cydriver.CUstream] StreamHandle ctypedef shared_ptr[const cydriver.CUevent] EventHandle ctypedef shared_ptr[const cydriver.CUmemoryPool] MemoryPoolHandle @@ -48,7 +47,6 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # as_cu() - extract the raw CUDA handle (inline C++) cydriver.CUcontext as_cu(ContextHandle h) noexcept nogil cydriver.CUgreenCtx as_cu(GreenCtxHandle h) noexcept nogil - cydriver.CUdevResourceDesc as_cu(DevResourceDescHandle h) noexcept nogil cydriver.CUstream as_cu(StreamHandle h) noexcept nogil cydriver.CUevent as_cu(EventHandle h) noexcept nogil cydriver.CUmemoryPool as_cu(MemoryPoolHandle h) noexcept nogil @@ -66,7 +64,6 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # as_intptr() - extract handle as intptr_t for Python interop (inline C++) intptr_t as_intptr(ContextHandle h) noexcept nogil intptr_t as_intptr(GreenCtxHandle h) noexcept nogil - intptr_t as_intptr(DevResourceDescHandle h) noexcept nogil intptr_t as_intptr(StreamHandle h) noexcept nogil intptr_t as_intptr(EventHandle h) noexcept nogil intptr_t as_intptr(MemoryPoolHandle h) noexcept nogil @@ -85,7 +82,6 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # as_py() - convert handle to Python wrapper object (inline C++; requires GIL) object as_py(ContextHandle h) object as_py(GreenCtxHandle h) - object as_py(DevResourceDescHandle h) object as_py(StreamHandle h) object as_py(EventHandle h) object as_py(MemoryPoolHandle h) @@ -118,10 +114,9 @@ cdef ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) except+ nog cdef ContextHandle create_context_handle_from_green_ctx(const GreenCtxHandle& h_green_ctx) except+ nogil cdef GreenCtxHandle get_context_green_ctx(const ContextHandle& h) noexcept nogil cdef GreenCtxHandle create_green_ctx_handle( - cydriver.CUdevResourceDesc desc, cydriver.CUdevice dev, unsigned int flags) except+ nogil + cydriver.CUdevResource* resources, unsigned int nbResources, + cydriver.CUdevice dev, unsigned int flags) except+ nogil cdef GreenCtxHandle create_green_ctx_handle_ref(cydriver.CUgreenCtx ctx) except+ nogil -cdef DevResourceDescHandle create_dev_resource_desc_handle( - cydriver.CUdevResource* resources, unsigned int nbResources) except+ nogil cdef ContextHandle get_primary_context(int device_id) except+ nogil cdef ContextHandle get_current_context() except+ nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 329bbaef24f..748a1c49e67 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -21,7 +21,6 @@ from cuda.bindings cimport cynvjitlink from ._resource_handles cimport ( ContextHandle, GreenCtxHandle, - DevResourceDescHandle, StreamHandle, EventHandle, MemoryPoolHandle, @@ -62,11 +61,10 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": GreenCtxHandle get_context_green_ctx "cuda_core::get_context_green_ctx" ( const ContextHandle& h) noexcept nogil GreenCtxHandle create_green_ctx_handle "cuda_core::create_green_ctx_handle" ( - cydriver.CUdevResourceDesc desc, cydriver.CUdevice dev, unsigned int flags) except+ nogil + cydriver.CUdevResource* resources, unsigned int nbResources, + cydriver.CUdevice dev, unsigned int flags) except+ nogil GreenCtxHandle create_green_ctx_handle_ref "cuda_core::create_green_ctx_handle_ref" ( cydriver.CUgreenCtx ctx) except+ nogil - DevResourceDescHandle create_dev_resource_desc_handle "cuda_core::create_dev_resource_desc_handle" ( - cydriver.CUdevResource* resources, unsigned int nbResources) except+ nogil ContextHandle get_primary_context "cuda_core::get_primary_context" ( int device_id) except+ nogil ContextHandle get_current_context "cuda_core::get_current_context" () except+ nogil From ac5c0fc4f6c5668bee0b618de2d174b4e65b53e5 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 25 Apr 2026 04:41:00 +0000 Subject: [PATCH 06/11] Expand green context test coverage with proper pytest patterns Restructure tests into fixtures + classes with full resource cleanup: - Fixtures: sm_resource, wq_resource, green_ctx (with CUDAError skip), green_ctx_active (with try/finally restore), fill_kernel - _use_green_ctx context manager for safe push/pop in all tests - TestSMResourceQuery: properties, arch constraints per CC - TestSMResourceSplit: single/two-group splits, discovery, alignment, dry-run vs real parity - TestGreenContextKernelLaunch: compile + launch + verify in green ctx, two independent green contexts, SM + workqueue combined All set_current calls are paired with restore in finally blocks to prevent context stack leaks on test failure. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/tests/test_green_context.py | 396 +++++++++++++++++++++----- 1 file changed, 320 insertions(+), 76 deletions(-) diff --git a/cuda_core/tests/test_green_context.py b/cuda_core/tests/test_green_context.py index d92a5089e8b..54e01b2f0f0 100644 --- a/cuda_core/tests/test_green_context.py +++ b/cuda_core/tests/test_green_context.py @@ -2,40 +2,120 @@ # # SPDX-License-Identifier: Apache-2.0 +import contextlib + +import numpy as np import pytest from cuda.core import ( ContextOptions, DeviceResources, + LaunchConfig, + LegacyPinnedMemoryResource, + Program, + ProgramOptions, SMResource, SMResourceOptions, WorkqueueResource, WorkqueueResourceOptions, + launch, ) from cuda.core._utils.cuda_utils import CUDAError +# --------------------------------------------------------------------------- +# Kernel source +# --------------------------------------------------------------------------- + +_FILL_KERNEL = r""" +extern "C" __global__ void fill(int* out, int value, int n) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { + out[tid] = value; + } +} +""" + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + -def _sm_resource_or_skip(dev): +@pytest.fixture +def sm_resource(init_cuda): + """Query SM resources from the device, skip if unsupported.""" try: - return dev.resources.sm + return init_cuda.resources.sm except (NotImplementedError, CUDAError) as exc: pytest.skip(str(exc)) -def _split_or_skip(sm, options, **kwargs): +@pytest.fixture +def wq_resource(init_cuda): + """Query workqueue resources from the device, skip if unsupported.""" try: - return sm.split(options, **kwargs) + return init_cuda.resources.workqueue except (NotImplementedError, CUDAError) as exc: pytest.skip(str(exc)) -def _green_context_or_skip(dev): - sm = _sm_resource_or_skip(dev) - groups, _ = _split_or_skip(sm, SMResourceOptions(count=None)) +@pytest.fixture +def green_ctx(init_cuda, sm_resource): + """Create a single-group green context with proper teardown.""" + groups, _ = sm_resource.split(SMResourceOptions(count=None)) try: - return dev.create_context(ContextOptions(resources=[groups[0]])) + ctx = init_cuda.create_context(ContextOptions(resources=[groups[0]])) except CUDAError as exc: pytest.skip(str(exc)) + yield ctx + ctx.close() + + +@pytest.fixture +def green_ctx_active(init_cuda, green_ctx): + """Set a green context as current and restore the previous on teardown. + + Yields (dev, green_ctx, stream) for use in kernel launch tests. + """ + dev = init_cuda + prev = dev.set_current(green_ctx) + try: + stream = dev.create_stream() + yield dev, green_ctx, stream + finally: + dev.set_current(prev) + + +@pytest.fixture +def fill_kernel(init_cuda): + """Compile the fill kernel for the current device.""" + dev = init_cuda + opts = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(_FILL_KERNEL, code_type="c++", options=opts) + mod = prog.compile("cubin") + return mod.get_kernel("fill") + + +def _aligned_half(sm): + """Compute half the SM count, rounded down to min_partition_size alignment.""" + min_size = sm.min_partition_size + half = (sm.sm_count // 2 // min_size) * min_size + return half + + +@contextlib.contextmanager +def _use_green_ctx(dev, ctx): + """Context manager: set green ctx current, restore previous on exit.""" + prev = dev.set_current(ctx) + try: + yield + finally: + dev.set_current(prev) + + +# --------------------------------------------------------------------------- +# Construction / type tests +# --------------------------------------------------------------------------- def test_not_user_constructible(): @@ -56,81 +136,186 @@ def test_create_context_without_resources_stays_unimplemented(init_cuda): init_cuda.create_context(object()) -def test_sm_resource_query(init_cuda): - sm = _sm_resource_or_skip(init_cuda) +# --------------------------------------------------------------------------- +# SM resource query +# --------------------------------------------------------------------------- - assert sm.handle != 0 - assert sm.sm_count > 0 - assert sm.min_partition_size > 0 - assert sm.coscheduled_alignment > 0 - assert isinstance(sm.flags, int) - assert not hasattr(sm, "memory_node_id") +class TestSMResourceQuery: + def test_properties(self, sm_resource): + assert sm_resource.handle != 0 + assert sm_resource.sm_count > 0 + assert sm_resource.min_partition_size > 0 + assert sm_resource.coscheduled_alignment > 0 + assert isinstance(sm_resource.flags, int) -def test_workqueue_resource_query_and_configure(init_cuda): - try: - wq = init_cuda.resources.workqueue - except (NotImplementedError, CUDAError) as exc: - pytest.skip(str(exc)) + def test_no_memory_node_id_in_v1(self, sm_resource): + """memory_node_id is deferred to v1.1 (CUDA 13.4).""" + assert not hasattr(sm_resource, "memory_node_id") - assert wq.handle != 0 - assert wq.configure(WorkqueueResourceOptions(sharing_scope=None)) is None - assert wq.configure(WorkqueueResourceOptions(sharing_scope="green_ctx_balanced")) is None - with pytest.raises(ValueError, match="Unknown sharing_scope"): - wq.configure(WorkqueueResourceOptions(sharing_scope="bogus")) + def test_arch_constraints_pre_hopper(self, init_cuda, sm_resource): + if init_cuda.compute_capability >= (9, 0): + pytest.skip("Test is for pre-Hopper architectures") + assert sm_resource.min_partition_size >= 2 + assert sm_resource.coscheduled_alignment >= 2 + def test_arch_constraints_hopper_plus(self, init_cuda, sm_resource): + if init_cuda.compute_capability < (9, 0): + pytest.skip("Test is for Hopper+ architectures") + assert sm_resource.min_partition_size >= 8 + assert sm_resource.coscheduled_alignment >= 8 -def test_sm_resource_split_validation(init_cuda): - sm = _sm_resource_or_skip(init_cuda) - count = sm.min_partition_size - with pytest.raises(ValueError, match="count is scalar"): - sm.split(SMResourceOptions(count=count, coscheduled_sm_count=(count, count))) +# --------------------------------------------------------------------------- +# Workqueue resource +# --------------------------------------------------------------------------- - with pytest.raises(ValueError, match="expected 2"): - sm.split(SMResourceOptions(count=(count, count), coscheduled_sm_count=(count, count, count))) - with pytest.raises(ValueError, match="count must be non-negative"): - sm.split(SMResourceOptions(count=-1)) +class TestWorkqueueResource: + def test_query(self, wq_resource): + assert wq_resource.handle != 0 + def test_configure_none_is_noop(self, wq_resource): + assert wq_resource.configure(WorkqueueResourceOptions(sharing_scope=None)) is None -def test_sm_resource_split_dry_run_cannot_create_context(init_cuda): - sm = _sm_resource_or_skip(init_cuda) - groups, _ = _split_or_skip(sm, SMResourceOptions(count=None), dry_run=True) + def test_configure_valid_scope(self, wq_resource): + wq_resource.configure(WorkqueueResourceOptions(sharing_scope="green_ctx_balanced")) - assert len(groups) == 1 - with pytest.raises(ValueError, match="dry-run SMResource"): - init_cuda.create_context(ContextOptions(resources=[groups[0]])) + def test_configure_invalid_scope_raises(self, wq_resource): + with pytest.raises(ValueError, match="Unknown sharing_scope"): + wq_resource.configure(WorkqueueResourceOptions(sharing_scope="bogus")) -def test_create_green_context(init_cuda): - ctx = _green_context_or_skip(init_cuda) +# --------------------------------------------------------------------------- +# SM resource split — validation +# --------------------------------------------------------------------------- - assert ctx.is_green - assert ctx.handle is not None - ctx.close() +class TestSMResourceSplitValidation: + def test_scalar_count_with_sequence_field_raises(self, sm_resource): + count = sm_resource.min_partition_size + with pytest.raises(ValueError, match="count is scalar"): + sm_resource.split( + SMResourceOptions( + count=count, + coscheduled_sm_count=(count, count), + ) + ) -def test_set_current_swap_preserves_green_context(init_cuda): - dev = init_cuda - green_ctx = _green_context_or_skip(dev) + def test_sequence_length_mismatch_raises(self, sm_resource): + count = sm_resource.min_partition_size + with pytest.raises(ValueError, match="expected 2"): + sm_resource.split( + SMResourceOptions( + count=(count, count), + coscheduled_sm_count=(count, count, count), + ) + ) - prev = dev.set_current(green_ctx) - assert prev is not None + def test_negative_count_raises(self, sm_resource): + with pytest.raises(ValueError, match="count must be non-negative"): + sm_resource.split(SMResourceOptions(count=-1)) - restored = dev.set_current(prev) - assert restored is green_ctx - assert restored.is_green - restored.close() + def test_dry_run_cannot_create_context(self, init_cuda, sm_resource): + groups, _ = sm_resource.split(SMResourceOptions(count=None), dry_run=True) + assert len(groups) == 1 + with pytest.raises(ValueError, match="dry-run SMResource"): + init_cuda.create_context(ContextOptions(resources=[groups[0]])) -def test_green_context_push_model_creates_stream_and_event(init_cuda): - dev = init_cuda - green_ctx = _green_context_or_skip(dev) +# --------------------------------------------------------------------------- +# SM resource split — functional +# --------------------------------------------------------------------------- + + +class TestSMResourceSplit: + def test_single_group_counts(self, sm_resource): + """Single-group split: group gets at least requested SMs.""" + requested = sm_resource.min_partition_size + groups, rem = sm_resource.split(SMResourceOptions(count=requested)) + + assert len(groups) == 1 + assert groups[0].sm_count >= requested + assert groups[0].sm_count + rem.sm_count <= sm_resource.sm_count + + def test_discovery_mode(self, sm_resource): + """count=None auto-detects a valid SM count.""" + groups, _ = sm_resource.split(SMResourceOptions(count=None)) + + assert len(groups) == 1 + assert groups[0].sm_count >= sm_resource.min_partition_size + + def test_discovery_respects_alignment(self, sm_resource): + groups, _ = sm_resource.split(SMResourceOptions(count=None)) + + if sm_resource.coscheduled_alignment > 0: + assert groups[0].sm_count % sm_resource.coscheduled_alignment == 0 + + def test_two_groups(self, sm_resource): + """Two-group split with explicit aligned counts.""" + half = _aligned_half(sm_resource) + if half < sm_resource.min_partition_size: + pytest.skip("Not enough SMs for a 2-group split") + + groups, rem = sm_resource.split(SMResourceOptions(count=(half, half))) + + assert len(groups) == 2 + assert groups[0].sm_count > 0 + assert groups[1].sm_count > 0 + total = groups[0].sm_count + groups[1].sm_count + rem.sm_count + assert total <= sm_resource.sm_count + + def test_two_groups_each_meets_request(self, sm_resource): + min_size = sm_resource.min_partition_size + half = _aligned_half(sm_resource) + if half < min_size: + pytest.skip("Not enough SMs for a 2-group split") + + groups, _ = sm_resource.split(SMResourceOptions(count=(min_size, min_size))) + + assert len(groups) == 2 + assert groups[0].sm_count >= min_size + assert groups[1].sm_count >= min_size + + def test_dry_run_matches_real(self, sm_resource): + """Dry-run reports the same SM counts as a real split.""" + opts = SMResourceOptions(count=None) + + dry_groups, _ = sm_resource.split(opts, dry_run=True) + real_groups, _ = sm_resource.split(opts, dry_run=False) + + assert len(dry_groups) == len(real_groups) + for dg, rg in zip(dry_groups, real_groups): + assert dg.sm_count == rg.sm_count + + +# --------------------------------------------------------------------------- +# Green context lifecycle +# --------------------------------------------------------------------------- + + +class TestGreenContextLifecycle: + def test_is_green(self, green_ctx): + assert green_ctx.is_green + assert green_ctx.handle is not None + + def test_set_current_swap_preserves_identity(self, init_cuda, green_ctx): + dev = init_cuda + with _use_green_ctx(dev, green_ctx): + pass # just verify push/pop works + # After exiting, primary context is restored. + # Verify we can swap in and get identity back: + prev = dev.set_current(green_ctx) + try: + pass + finally: + restored = dev.set_current(prev) + assert restored is green_ctx + assert restored.is_green + + def test_stream_and_event_track_green_context(self, green_ctx_active): + dev, green_ctx, stream = green_ctx_active - prev = dev.set_current(green_ctx) - try: - stream = dev.create_stream() event = stream.record() assert stream.context.is_green assert stream.context == green_ctx @@ -138,23 +323,82 @@ def test_green_context_push_model_creates_stream_and_event(init_cuda): assert event.context == green_ctx stream.sync() event.sync() - finally: - restored = dev.set_current(prev) - assert restored is green_ctx - restored.close() + def test_close_while_current_raises(self, init_cuda, green_ctx): + dev = init_cuda + with _use_green_ctx(dev, green_ctx), pytest.raises(RuntimeError, match="while it is current"): + green_ctx.close() -def test_close_current_green_context_raises(init_cuda): - dev = init_cuda - green_ctx = _green_context_or_skip(dev) +# --------------------------------------------------------------------------- +# Kernel launch in green context +# --------------------------------------------------------------------------- - prev = dev.set_current(green_ctx) - try: - with pytest.raises(RuntimeError, match="while it is current"): - green_ctx.close() - finally: - restored = dev.set_current(prev) - assert restored is green_ctx - restored.close() +def _launch_fill_and_verify(dev, stream, kernel, n, value): + """Launch the fill kernel and verify results on host.""" + dev_buf = dev.allocate(n * np.dtype(np.int32).itemsize, stream=stream) + + config = LaunchConfig(grid=(n + 31) // 32, block=32) + launch(stream, config, kernel, dev_buf, np.int32(value), np.int32(n)) + + host_mr = LegacyPinnedMemoryResource() + host_buf = host_mr.allocate(n * np.dtype(np.int32).itemsize) + host_arr = np.from_dlpack(host_buf).view(np.int32) + host_arr[:] = 0 + + dev_buf.copy_to(host_buf, stream=stream) + stream.sync() + + np.testing.assert_array_equal(host_arr, np.full(n, value, dtype=np.int32)) + + +class TestGreenContextKernelLaunch: + def test_launch_and_verify(self, green_ctx_active, fill_kernel): + """Compile, launch in green context, verify results on host.""" + dev, _, stream = green_ctx_active + _launch_fill_and_verify(dev, stream, fill_kernel, n=64, value=42) + + def test_two_green_contexts_independent(self, init_cuda, sm_resource, fill_kernel): + """Two SM groups -> two green contexts -> two independent kernels.""" + dev = init_cuda + half = _aligned_half(sm_resource) + if half < sm_resource.min_partition_size: + pytest.skip("Not enough SMs for a 2-group split") + + groups, _ = sm_resource.split(SMResourceOptions(count=(half, half))) + assert len(groups) == 2 + + ctx_a = ctx_b = None + try: + ctx_a = dev.create_context(ContextOptions(resources=[groups[0]])) + ctx_b = dev.create_context(ContextOptions(resources=[groups[1]])) + + for ctx, value in [(ctx_a, 10), (ctx_b, 20)]: + with _use_green_ctx(dev, ctx): + stream = dev.create_stream() + _launch_fill_and_verify(dev, stream, fill_kernel, n=64, value=value) + finally: + if ctx_b is not None: + ctx_b.close() + if ctx_a is not None: + ctx_a.close() + + def test_with_workqueue_resource(self, init_cuda, sm_resource, wq_resource, fill_kernel): + """Green context with SM + workqueue resources can launch a kernel.""" + dev = init_cuda + groups, _ = sm_resource.split(SMResourceOptions(count=None)) + + try: + ctx = dev.create_context(ContextOptions(resources=[groups[0], wq_resource])) + except CUDAError as exc: + pytest.skip(str(exc)) + + assert ctx.is_green + + try: + with _use_green_ctx(dev, ctx): + stream = dev.create_stream() + _launch_fill_and_verify(dev, stream, fill_kernel, n=32, value=99) + finally: + ctx.close() From 12a146744ff9c9eac81b722bc0ae18282fd1f4d8 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 27 Apr 2026 03:06:14 +0000 Subject: [PATCH 07/11] Lower green context handling to Cython and simplify Context - Convert ContextOptions and SMResourceOptions/WorkqueueResourceOptions to cdef dataclasses for check_or_create_options compatibility. - Cache SM metadata in typed cdef fields; fall back to arch-based granularity on CUDA 12.x where CUdevSmResource lacks minSmPartitionSize/smCoscheduledAlignment. - Simplify Context to hold only ContextHandle (remove _h_green_ctx and _is_green fields). Green ctx association lives in ContextBox; is_green queries get_context_green_ctx() on demand. - ContextOptions.resources accepts Sequence only (no bare resource). Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_context.pxd | 2 - cuda_core/cuda/core/_context.pyx | 30 ++-- cuda_core/cuda/core/_device.pyx | 15 +- cuda_core/cuda/core/_device_resources.pxd | 8 +- cuda_core/cuda/core/_device_resources.pyx | 196 ++++++++++++++-------- 5 files changed, 149 insertions(+), 102 deletions(-) diff --git a/cuda_core/cuda/core/_context.pxd b/cuda_core/cuda/core/_context.pxd index 5d3ed9bba2f..92fa5700a06 100644 --- a/cuda_core/cuda/core/_context.pxd +++ b/cuda_core/cuda/core/_context.pxd @@ -13,9 +13,7 @@ cdef class Context: cdef: ContextHandle _h_context - GreenCtxHandle _h_green_ctx int _device_id - bint _is_green object __weakref__ @staticmethod diff --git a/cuda_core/cuda/core/_context.pyx b/cuda_core/cuda/core/_context.pyx index 727351fc809..ef2c4a89e17 100644 --- a/cuda_core/cuda/core/_context.pyx +++ b/cuda_core/cuda/core/_context.pyx @@ -2,9 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from collections.abc import Sequence from dataclasses import dataclass from cuda.bindings cimport cydriver +from cuda.core._device_resources import SMResource, WorkqueueResource from cuda.core._resource_handles cimport ( ContextHandle, GreenCtxHandle, @@ -21,6 +25,9 @@ from cuda.core._utils.cuda_utils cimport HANDLE_RETURN __all__ = ['Context', 'ContextOptions'] +DeviceResourcesT = Sequence[SMResource | WorkqueueResource] + + cdef class Context: """CUDA context wrapper. @@ -36,23 +43,17 @@ cdef class Context: """Create Context from existing ContextHandle (cdef-only factory).""" cdef Context ctx = cls.__new__(cls) ctx._h_context = h_context - ctx._h_green_ctx = get_context_green_ctx(h_context) ctx._device_id = device_id - ctx._is_green = ctx._h_green_ctx.get() != NULL return ctx @staticmethod cdef Context _from_green_ctx(type cls, GreenCtxHandle h_green_ctx, int device_id): """Create Context from an owning green context handle.""" - cdef Context ctx = cls.__new__(cls) - ctx._h_green_ctx = h_green_ctx - ctx._h_context = create_context_handle_from_green_ctx(h_green_ctx) - if not ctx._h_context: + cdef ContextHandle h_context = create_context_handle_from_green_ctx(h_green_ctx) + if not h_context: HANDLE_RETURN(get_last_error()) raise RuntimeError("Failed to create CUDA context view from green context") - ctx._device_id = device_id - ctx._is_green = True - return ctx + return Context._from_handle(cls, h_context, device_id) @property def handle(self): @@ -70,7 +71,9 @@ cdef class Context: @property def is_green(self) -> bool: """True if this context was created from device resources.""" - return bool(self._is_green) + if not self._h_context: + return False + return get_context_green_ctx(self._h_context).get() != NULL cpdef close(self): """Release this context wrapper's underlying CUDA handles.""" @@ -84,7 +87,6 @@ cdef class Context: "Restore a previous context before closing this context." ) self._h_context.reset() - self._h_green_ctx.reset() def __eq__(self, other): if not isinstance(other, Context): @@ -100,12 +102,12 @@ cdef class Context: @dataclass -class ContextOptions: +cdef class ContextOptions: """Options for context creation. Attributes ---------- - resources : Sequence[SMResource | WorkqueueResource], optional + resources : :obj:`~_context.DeviceResourcesT` Device resources used to create a green context. """ - resources: object = None + resources: DeviceResourcesT diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index df1f95f158b..1a96841ed09 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -7,7 +7,7 @@ from __future__ import annotations cimport cpython from cuda.bindings cimport cydriver -from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.cuda_utils cimport check_or_create_options, HANDLE_RETURN from libc.stdlib cimport free, malloc import threading @@ -15,11 +15,6 @@ import threading from cuda.core._context cimport Context from cuda.core._context import ContextOptions from cuda.core._device_resources cimport DeviceResources, SMResource, WorkqueueResource -from cuda.core._device_resources import ( - DeviceResources, - SMResource, - WorkqueueResource, -) from cuda.core._event cimport Event as cyEvent from cuda.core._event import Event, EventOptions from cuda.core._memory._buffer cimport Buffer, MemoryResource @@ -1309,15 +1304,11 @@ class Device: if options is None: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") - assert_type(options, ContextOptions) + options = check_or_create_options(ContextOptions, options, "Context options") if options.resources is None: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") - resources = options.resources - if isinstance(resources, (SMResource, WorkqueueResource)): - resources = (resources,) - else: - resources = tuple(resources) + resources = tuple(options.resources) if len(resources) == 0: raise ValueError("ContextOptions.resources must not be empty") diff --git a/cuda_core/cuda/core/_device_resources.pxd b/cuda_core/cuda/core/_device_resources.pxd index 0e614562d00..62dbf32d688 100644 --- a/cuda_core/cuda/core/_device_resources.pxd +++ b/cuda_core/cuda/core/_device_resources.pxd @@ -8,14 +8,18 @@ from cuda.bindings cimport cydriver cdef class SMResource: cdef: cydriver.CUdevResource _resource + unsigned int _sm_count + unsigned int _min_partition_size + unsigned int _coscheduled_alignment + unsigned int _flags bint _is_usable object __weakref__ @staticmethod - cdef SMResource _from_dev_resource(cydriver.CUdevResource res) + cdef SMResource _from_dev_resource(cydriver.CUdevResource res, int device_id) @staticmethod - cdef SMResource _from_dry_run_resource(cydriver.CUdevResource res) + cdef SMResource _from_split_resource(cydriver.CUdevResource res, SMResource parent, bint is_usable) cdef class WorkqueueResource: diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx index 2db864b039e..78f6e70cf7f 100644 --- a/cuda_core/cuda/core/_device_resources.pyx +++ b/cuda_core/cuda/core/_device_resources.pyx @@ -12,7 +12,8 @@ from libc.stdlib cimport free, malloc from libc.string cimport memset from cuda.bindings cimport cydriver -from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.cuda_utils cimport check_or_create_options, HANDLE_RETURN +from cuda.core._utils.cuda_utils import is_sequence from cuda.core._utils.version cimport cy_binding_version, cy_driver_version @@ -25,7 +26,7 @@ __all__ = [ ] -cdef inline void _check_green_ctx_support() except *: +cdef inline int _check_green_ctx_support() except?-1: cdef tuple drv = cy_driver_version() cdef tuple bind = cy_binding_version() if drv < (12, 4, 0): @@ -38,9 +39,10 @@ cdef inline void _check_green_ctx_support() except *: "Green context support requires cuda.bindings 12.4 or newer. " f"Using cuda.bindings version {'.'.join(map(str, bind))}" ) + return 0 -cdef inline void _check_workqueue_support() except *: +cdef inline int _check_workqueue_support() except?-1: cdef tuple drv = cy_driver_version() cdef tuple bind = cy_binding_version() if drv < (13, 1, 0): @@ -53,10 +55,11 @@ cdef inline void _check_workqueue_support() except *: "WorkqueueResource requires cuda.bindings 13.1 or newer. " f"Using cuda.bindings version {'.'.join(map(str, bind))}" ) + return 0 @dataclass -class SMResourceOptions: +cdef class SMResourceOptions: """Options for :meth:`SMResource.split`. ``count`` determines the number of requested groups. Scalar ``count`` or @@ -70,61 +73,62 @@ class SMResourceOptions: @dataclass -class WorkqueueResourceOptions: +cdef class WorkqueueResourceOptions: """Options for :meth:`WorkqueueResource.configure`.""" sharing_scope: str | None = None -cdef inline bint _is_sequence(object value): - return ( - isinstance(value, SequenceABC) - and not isinstance(value, (str, bytes, bytearray)) - ) +cdef inline int _validate_split_field_length( + object value, str field_name, int n_groups, bint count_is_scalar +) except?-1: + if count_is_scalar: + if is_sequence(value): + raise ValueError( + f"{field_name} is a Sequence but count is scalar; " + "count must be a Sequence to specify multiple groups" + ) + elif is_sequence(value) and len(value) != n_groups: + raise ValueError( + f"{field_name} has length {len(value)}, expected {n_groups} " + "(must match count)" + ) + return 0 -cdef int _resolve_group_count(object options) except -1: +cdef int _resolve_group_count(SMResourceOptions options) except -1: cdef object count = options.count cdef int n_groups - cdef object value - cdef str field_name + cdef bint count_is_scalar if count is None or isinstance(count, int): n_groups = 1 - elif _is_sequence(count): + count_is_scalar = True + elif is_sequence(count): n_groups = len(count) if n_groups == 0: raise ValueError("count sequence must not be empty") + count_is_scalar = False else: raise TypeError(f"count must be int, Sequence, or None, got {type(count)}") - if n_groups == 1: - for field_name in ( - "coscheduled_sm_count", - "preferred_coscheduled_sm_count", - ): - value = getattr(options, field_name) - if _is_sequence(value): - raise ValueError( - f"{field_name} is a Sequence but count is scalar; " - "count must be a Sequence to specify multiple groups" - ) - else: - for field_name in ( - "coscheduled_sm_count", - "preferred_coscheduled_sm_count", - ): - value = getattr(options, field_name) - if _is_sequence(value) and len(value) != n_groups: - raise ValueError( - f"{field_name} has length {len(value)}, expected {n_groups} " - "(must match count)" - ) + _validate_split_field_length( + options.coscheduled_sm_count, + "coscheduled_sm_count", + n_groups, + count_is_scalar, + ) + _validate_split_field_length( + options.preferred_coscheduled_sm_count, + "preferred_coscheduled_sm_count", + n_groups, + count_is_scalar, + ) return n_groups cdef object _broadcast_field(object value, int n_groups): - if _is_sequence(value): + if is_sequence(value): return list(value) return [value] * n_groups @@ -150,7 +154,7 @@ cdef inline bint _can_use_structured_sm_split(): return False -cdef inline void _check_split_by_count_support() except *: +cdef inline int _check_split_by_count_support() except?-1: cdef tuple drv = cy_driver_version() cdef tuple bind = cy_binding_version() if drv < (12, 4, 0): @@ -163,9 +167,10 @@ cdef inline void _check_split_by_count_support() except *: "SMResource.split() requires cuda.bindings 12.4 or newer. " f"Using cuda.bindings version {'.'.join(map(str, bind))}" ) + return 0 -cdef object _resolve_split_by_count_request(object options): +cdef object _resolve_split_by_count_request(SMResourceOptions options): cdef int n_groups = _resolve_group_count(options) cdef list counts = _broadcast_field(options.count, n_groups) cdef object first = counts[0] @@ -195,11 +200,11 @@ cdef object _resolve_split_by_count_request(object options): IF CUDA_CORE_BUILD_MAJOR >= 13: - cdef void _fill_group_params( + cdef int _fill_group_params( cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS* params, int n_groups, - object options, - ) except *: + SMResourceOptions options, + ) except?-1: cdef list counts = _broadcast_field(options.count, n_groups) cdef list coscheduled = _broadcast_field(options.coscheduled_sm_count, n_groups) cdef list preferred = _broadcast_field(options.preferred_coscheduled_sm_count, n_groups) @@ -215,9 +220,10 @@ IF CUDA_CORE_BUILD_MAJOR >= 13: preferred[i], "preferred_coscheduled_sm_count" ) params[i].flags = 0 + return 0 - cdef object _split_with_general_api(SMResource sm, object options, bint dry_run): + cdef object _split_with_general_api(SMResource sm, SMResourceOptions options, bint dry_run): cdef int n_groups = _resolve_group_count(options) cdef cydriver.CUdevResource* result = NULL cdef cydriver.CUdevResource remaining @@ -255,28 +261,28 @@ IF CUDA_CORE_BUILD_MAJOR >= 13: if result != NULL: for i in range(n_groups): - groups.append(SMResource._from_dev_resource(result[i])) - return groups, SMResource._from_dev_resource(remaining) + groups.append(SMResource._from_split_resource(result[i], sm, True)) + return groups, SMResource._from_split_resource(remaining, sm, True) for i in range(n_groups): memset(&synth, 0, sizeof(cydriver.CUdevResource)) synth.type = cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM synth.sm.smCount = params[i].smCount - groups.append(SMResource._from_dry_run_resource(synth)) - return groups, SMResource._from_dry_run_resource(remaining) + groups.append(SMResource._from_split_resource(synth, sm, False)) + return groups, SMResource._from_split_resource(remaining, sm, False) finally: if params != NULL: free(params) if result != NULL: free(result) ELSE: - cdef object _split_with_general_api(SMResource sm, object options, bint dry_run): + cdef object _split_with_general_api(SMResource sm, SMResourceOptions options, bint dry_run): raise NotImplementedError( "SMResource.split() requires cuda.core to be built with CUDA 13.x bindings" ) -cdef object _split_with_count_api(SMResource sm, object options, bint dry_run): +cdef object _split_with_count_api(SMResource sm, SMResourceOptions options, bint dry_run): cdef object request = _resolve_split_by_count_request(options) cdef unsigned int nb_groups = request[0] cdef unsigned int min_count = request[1] @@ -304,16 +310,36 @@ cdef object _split_with_count_api(SMResource sm, object options, bint dry_run): for i in range(actual_groups): if dry_run: - groups.append(SMResource._from_dry_run_resource(result[i])) + groups.append(SMResource._from_split_resource(result[i], sm, False)) else: - groups.append(SMResource._from_dev_resource(result[i])) + groups.append(SMResource._from_split_resource(result[i], sm, True)) if dry_run: - return groups, SMResource._from_dry_run_resource(remaining) - return groups, SMResource._from_dev_resource(remaining) + return groups, SMResource._from_split_resource(remaining, sm, False) + return groups, SMResource._from_split_resource(remaining, sm, True) finally: free(result) +cdef inline unsigned int _sm_resource_granularity(int device_id) except? 0: + cdef int major + + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetAttribute( + &major, + cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device_id, + )) + if major >= 9: + return 8 + return 2 + + +cdef inline unsigned int _fallback_if_zero(unsigned int value, unsigned int fallback) noexcept: + if value != 0: + return value + return fallback + + cdef class SMResource: """SM resource queried from a device. Not user-constructible.""" @@ -324,17 +350,41 @@ cdef class SMResource: ) @staticmethod - cdef SMResource _from_dev_resource(cydriver.CUdevResource res): + cdef SMResource _from_dev_resource(cydriver.CUdevResource res, int device_id): cdef SMResource self = SMResource.__new__(SMResource) self._resource = res + self._sm_count = res.sm.smCount + IF CUDA_CORE_BUILD_MAJOR >= 13: + self._min_partition_size = res.sm.minSmPartitionSize + self._coscheduled_alignment = res.sm.smCoscheduledAlignment + self._flags = res.sm.flags + ELSE: + self._min_partition_size = _sm_resource_granularity(device_id) + self._coscheduled_alignment = self._min_partition_size + self._flags = 0 self._is_usable = True return self @staticmethod - cdef SMResource _from_dry_run_resource(cydriver.CUdevResource res): + cdef SMResource _from_split_resource(cydriver.CUdevResource res, SMResource parent, bint is_usable): cdef SMResource self = SMResource.__new__(SMResource) self._resource = res - self._is_usable = False + self._sm_count = res.sm.smCount + IF CUDA_CORE_BUILD_MAJOR >= 13: + self._min_partition_size = _fallback_if_zero( + res.sm.minSmPartitionSize, + parent._min_partition_size, + ) + self._coscheduled_alignment = _fallback_if_zero( + res.sm.smCoscheduledAlignment, + parent._coscheduled_alignment, + ) + self._flags = res.sm.flags + ELSE: + self._min_partition_size = parent._min_partition_size + self._coscheduled_alignment = parent._coscheduled_alignment + self._flags = parent._flags + self._is_usable = is_usable return self @property @@ -345,33 +395,34 @@ cdef class SMResource: @property def sm_count(self) -> int: """Total SMs available in this resource.""" - return self._resource.sm.smCount + return self._sm_count @property def min_partition_size(self) -> int: """Minimum SM count required to create a partition.""" - return self._resource.sm.minSmPartitionSize + return self._min_partition_size @property def coscheduled_alignment(self) -> int: """Number of SMs guaranteed to be co-scheduled.""" - return self._resource.sm.smCoscheduledAlignment + return self._coscheduled_alignment @property def flags(self) -> int: """Raw flags from the underlying SM resource.""" - return self._resource.sm.flags + return self._flags def split(self, options not None, *, bint dry_run=False): """Split this SM resource into groups plus a remainder.""" - if not isinstance(options, SMResourceOptions): - raise TypeError(f"options must be SMResourceOptions, got {type(options)}") - _resolve_group_count(options) + cdef SMResourceOptions opts = check_or_create_options( + SMResourceOptions, options, "SM resource options" + ) + _resolve_group_count(opts) _check_green_ctx_support() if _can_use_structured_sm_split(): - return _split_with_general_api(self, options, dry_run) + return _split_with_general_api(self, opts, dry_run) _check_split_by_count_support() - return _split_with_count_api(self, options, dry_run) + return _split_with_count_api(self, opts, dry_run) cdef class WorkqueueResource: @@ -400,25 +451,26 @@ cdef class WorkqueueResource: def configure(self, options not None): """Configure the workqueue resource in place.""" + cdef WorkqueueResourceOptions opts = check_or_create_options( + WorkqueueResourceOptions, options, "Workqueue resource options" + ) _check_green_ctx_support() _check_workqueue_support() - if not isinstance(options, WorkqueueResourceOptions): - raise TypeError(f"options must be WorkqueueResourceOptions, got {type(options)}") - if options.sharing_scope is None: + if opts.sharing_scope is None: return None IF CUDA_CORE_BUILD_MAJOR >= 13: - if options.sharing_scope == "device_ctx": + if opts.sharing_scope == "device_ctx": self._wq_config_resource.wqConfig.sharingScope = ( cydriver.CUdevWorkqueueConfigScope.CU_WORKQUEUE_SCOPE_DEVICE_CTX ) - elif options.sharing_scope == "green_ctx_balanced": + elif opts.sharing_scope == "green_ctx_balanced": self._wq_config_resource.wqConfig.sharingScope = ( cydriver.CUdevWorkqueueConfigScope.CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED ) else: raise ValueError( - f"Unknown sharing_scope: {options.sharing_scope!r}. " + f"Unknown sharing_scope: {opts.sharing_scope!r}. " "Expected 'device_ctx' or 'green_ctx_balanced'." ) ELSE: @@ -453,7 +505,7 @@ cdef class DeviceResources: &res, cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM, )) - return SMResource._from_dev_resource(res) + return SMResource._from_dev_resource(res, self._device_id) @property def workqueue(self) -> WorkqueueResource: From b6959e4c614154b430ff467418251abb730eab60 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 27 Apr 2026 03:06:29 +0000 Subject: [PATCH 08/11] Add explicit green context model: ctx.create_stream and ctx.resources Switch from the push model (dev.set_current + dev.create_stream) to the explicit model (ctx.create_stream + ctx.resources) as the primary way to use green contexts. Context.create_stream(options): - Only supported on green contexts (raises on primary contexts). - Delegates to Stream._init, which calls create_stream_handle in C++. - C++ create_stream_handle auto-dispatches: checks get_context_green_ctx and calls cuGreenCtxStreamCreate for green contexts, or cuStreamCreateWithPriority for primary. Single function, no duplication. Context.resources: - Returns a DeviceResources namespace querying this context's resources (cuCtxGetDevResource / cuGreenCtxGetDevResource), not the full device. dev.set_current(green_ctx) still works but is not the recommended path. Tests rewritten to use the explicit model throughout. Push-model set_current kept as regression tests with _use_green_ctx helper. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_context.pyx | 42 ++++++ cuda_core/cuda/core/_cpp/resource_handles.cpp | 15 ++- cuda_core/cuda/core/_cpp/resource_handles.hpp | 2 + cuda_core/cuda/core/_device_resources.pxd | 7 + cuda_core/cuda/core/_device_resources.pyx | 87 ++++++++++--- cuda_core/cuda/core/_resource_handles.pyx | 2 + cuda_core/cuda/core/_stream.pyx | 3 +- cuda_core/tests/test_green_context.py | 120 ++++++++++++------ 8 files changed, 213 insertions(+), 65 deletions(-) diff --git a/cuda_core/cuda/core/_context.pyx b/cuda_core/cuda/core/_context.pyx index ef2c4a89e17..a8bddf67dc8 100644 --- a/cuda_core/cuda/core/_context.pyx +++ b/cuda_core/cuda/core/_context.pyx @@ -8,6 +8,7 @@ from collections.abc import Sequence from dataclasses import dataclass from cuda.bindings cimport cydriver +from cuda.core._device_resources cimport DeviceResources, SMResource, WorkqueueResource from cuda.core._device_resources import SMResource, WorkqueueResource from cuda.core._resource_handles cimport ( ContextHandle, @@ -19,6 +20,7 @@ from cuda.core._resource_handles cimport ( as_intptr, as_py, ) +from cuda.core._stream import Stream, StreamOptions from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -75,6 +77,46 @@ cdef class Context: return False return get_context_green_ctx(self._h_context).get() != NULL + @property + def resources(self) -> DeviceResources: + """Query the hardware resources provisioned for this context. + + For green contexts, returns the resources this context was created + with (SM partition, workqueue config). For primary contexts, returns + the full device resources. + + Raises :class:`RuntimeError` if the context has been closed. + """ + if not self._h_context: + raise RuntimeError("Cannot query resources on a closed context") + return DeviceResources._init_from_ctx(self._h_context, self._device_id) + + def create_stream(self, options: StreamOptions | None = None): + """Create a new stream bound to this green context. + + This method is only available on green contexts. For primary + contexts, use :meth:`Device.create_stream` instead. + + Parameters + ---------- + options : :obj:`~_stream.StreamOptions`, optional + Customizable dataclass for stream creation options. + + Returns + ------- + :obj:`~_stream.Stream` + Newly created stream object. + """ + if not self._h_context: + raise RuntimeError("Cannot create a stream on a closed context") + if not self.is_green: + raise RuntimeError( + "Context.create_stream() is only supported on green contexts. " + "Use Device.create_stream() for primary contexts." + ) + + return Stream._init(options=options, device_id=self._device_id, ctx=self) + cpdef close(self): """Release this context wrapper's underlying CUDA handles.""" cdef cydriver.CUcontext current_ctx diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 3119f546b55..261efcfb368 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -34,6 +34,8 @@ decltype(&cuGreenCtxDestroy) p_cuGreenCtxDestroy = nullptr; decltype(&cuCtxFromGreenCtx) p_cuCtxFromGreenCtx = nullptr; decltype(&cuDevResourceGenerateDesc) p_cuDevResourceGenerateDesc = nullptr; +decltype(&cuGreenCtxStreamCreate) p_cuGreenCtxStreamCreate = nullptr; + decltype(&cuStreamCreateWithPriority) p_cuStreamCreateWithPriority = nullptr; decltype(&cuStreamDestroy) p_cuStreamDestroy = nullptr; @@ -409,8 +411,17 @@ static HandleRegistry stream_registry; StreamHandle create_stream_handle(const ContextHandle& h_ctx, unsigned int flags, int priority) { GILReleaseGuard gil; CUstream stream; - if (CUDA_SUCCESS != (err = p_cuStreamCreateWithPriority(&stream, flags, priority))) { - return {}; + + // Dispatch: green context uses cuGreenCtxStreamCreate, primary uses cuStreamCreateWithPriority + GreenCtxHandle h_green = get_context_green_ctx(h_ctx); + if (h_green && p_cuGreenCtxStreamCreate) { + if (CUDA_SUCCESS != (err = p_cuGreenCtxStreamCreate(&stream, as_cu(h_green), flags, priority))) { + return {}; + } + } else { + if (CUDA_SUCCESS != (err = p_cuStreamCreateWithPriority(&stream, flags, priority))) { + return {}; + } } auto box = std::shared_ptr( diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 0f91113e4e4..73d3364ba5f 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -64,6 +64,8 @@ extern decltype(&cuGreenCtxDestroy) p_cuGreenCtxDestroy; extern decltype(&cuCtxFromGreenCtx) p_cuCtxFromGreenCtx; extern decltype(&cuDevResourceGenerateDesc) p_cuDevResourceGenerateDesc; +extern decltype(&cuGreenCtxStreamCreate) p_cuGreenCtxStreamCreate; + extern decltype(&cuStreamCreateWithPriority) p_cuStreamCreateWithPriority; extern decltype(&cuStreamDestroy) p_cuStreamDestroy; diff --git a/cuda_core/cuda/core/_device_resources.pxd b/cuda_core/cuda/core/_device_resources.pxd index 62dbf32d688..d618c24cf10 100644 --- a/cuda_core/cuda/core/_device_resources.pxd +++ b/cuda_core/cuda/core/_device_resources.pxd @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver +from cuda.core._resource_handles cimport ContextHandle, GreenCtxHandle cdef class SMResource: @@ -38,7 +39,13 @@ cdef class WorkqueueResource: cdef class DeviceResources: cdef: int _device_id + ContextHandle _h_context # NULL for device-level queries object __weakref__ @staticmethod cdef DeviceResources _init(int device_id) + + @staticmethod + cdef DeviceResources _init_from_ctx(ContextHandle h_context, int device_id) + + cdef inline int _query_sm(self, cydriver.CUdevResource* res) except?-1 nogil diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx index 78f6e70cf7f..683cdf9cd9e 100644 --- a/cuda_core/cuda/core/_device_resources.pyx +++ b/cuda_core/cuda/core/_device_resources.pyx @@ -12,6 +12,7 @@ from libc.stdlib cimport free, malloc from libc.string cimport memset from cuda.bindings cimport cydriver +from cuda.core._resource_handles cimport ContextHandle, GreenCtxHandle, as_cu, get_context_green_ctx from cuda.core._utils.cuda_utils cimport check_or_create_options, HANDLE_RETURN from cuda.core._utils.cuda_utils import is_sequence from cuda.core._utils.version cimport cy_binding_version, cy_driver_version @@ -480,53 +481,99 @@ cdef class WorkqueueResource: cdef class DeviceResources: - """Namespace for hardware resource query. Not user-constructible.""" + """Namespace for hardware resource query. Not user-constructible. + + When obtained via ``dev.resources``, queries return full device resources. + When obtained via ``ctx.resources``, queries return the resources + provisioned for that context. + """ def __init__(self, *args, **kwargs): raise RuntimeError( "DeviceResources cannot be instantiated directly. " - "Use dev.resources." + "Use dev.resources or ctx.resources." ) @staticmethod cdef DeviceResources _init(int device_id): cdef DeviceResources self = DeviceResources.__new__(DeviceResources) self._device_id = device_id + # _h_context is default empty — queries use cuDeviceGetDevResource return self + @staticmethod + cdef DeviceResources _init_from_ctx(ContextHandle h_context, int device_id): + cdef DeviceResources self = DeviceResources.__new__(DeviceResources) + self._device_id = device_id + self._h_context = h_context + return self + + cdef inline int _query_sm(self, cydriver.CUdevResource* res) except?-1 nogil: + """Query SM resource from either device or context.""" + cdef GreenCtxHandle h_green + if self._h_context: + h_green = get_context_green_ctx(self._h_context) + if h_green: + HANDLE_RETURN(cydriver.cuGreenCtxGetDevResource( + as_cu(h_green), res, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM, + )) + else: + HANDLE_RETURN(cydriver.cuCtxGetDevResource( + as_cu(self._h_context), res, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM, + )) + else: + HANDLE_RETURN(cydriver.cuDeviceGetDevResource( + self._device_id, res, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM, + )) + return 0 + @property def sm(self) -> SMResource: - """Query SM resources from this device.""" + """Query SM resources.""" _check_green_ctx_support() cdef cydriver.CUdevResource res with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetDevResource( - self._device_id, - &res, - cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM, - )) + self._query_sm(&res) return SMResource._from_dev_resource(res, self._device_id) @property def workqueue(self) -> WorkqueueResource: - """Query workqueue resources from this device.""" + """Query workqueue resources.""" _check_green_ctx_support() _check_workqueue_support() cdef cydriver.CUdevResource _wq_config cdef cydriver.CUdevResource _wq IF CUDA_CORE_BUILD_MAJOR >= 13: - with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetDevResource( - self._device_id, - &_wq_config, - cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG, - )) - HANDLE_RETURN(cydriver.cuDeviceGetDevResource( - self._device_id, - &_wq, - cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE, - )) + if self._h_context: + # Context-level query + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetDevResource( + as_cu(self._h_context), + &_wq_config, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG, + )) + HANDLE_RETURN(cydriver.cuCtxGetDevResource( + as_cu(self._h_context), + &_wq, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE, + )) + else: + # Device-level query + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetDevResource( + self._device_id, + &_wq_config, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG, + )) + HANDLE_RETURN(cydriver.cuDeviceGetDevResource( + self._device_id, + &_wq, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE, + )) return WorkqueueResource._from_dev_resources(_wq_config, _wq) ELSE: raise NotImplementedError( diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 748a1c49e67..59e47f23462 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -239,6 +239,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": void* p_cuGreenCtxDestroy "reinterpret_cast(cuda_core::p_cuGreenCtxDestroy)" void* p_cuCtxFromGreenCtx "reinterpret_cast(cuda_core::p_cuCtxFromGreenCtx)" void* p_cuDevResourceGenerateDesc "reinterpret_cast(cuda_core::p_cuDevResourceGenerateDesc)" + void* p_cuGreenCtxStreamCreate "reinterpret_cast(cuda_core::p_cuGreenCtxStreamCreate)" # Stream void* p_cuStreamCreateWithPriority "reinterpret_cast(cuda_core::p_cuStreamCreateWithPriority)" @@ -320,6 +321,7 @@ p_cuGreenCtxCreate = _get_optional_driver_fn("cuGreenCtxCreate") p_cuGreenCtxDestroy = _get_optional_driver_fn("cuGreenCtxDestroy") p_cuCtxFromGreenCtx = _get_optional_driver_fn("cuCtxFromGreenCtx") p_cuDevResourceGenerateDesc = _get_optional_driver_fn("cuDevResourceGenerateDesc") +p_cuGreenCtxStreamCreate = _get_optional_driver_fn("cuGreenCtxStreamCreate") # Stream p_cuStreamCreateWithPriority = _get_driver_fn("cuStreamCreateWithPriority") diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index 0c1ab6f4b2c..d3b41139436 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -153,7 +153,8 @@ cdef class Stream: else: prio = high - # C++ creates the stream and returns owning handle with context dependency + # C++ creates the stream and returns owning handle with context dependency. + # For green contexts, the C++ layer auto-dispatches to cuGreenCtxStreamCreate. h_stream = create_stream_handle(h_context, flags, prio) if not h_stream: raise RuntimeError("Failed to create CUDA stream") diff --git a/cuda_core/tests/test_green_context.py b/cuda_core/tests/test_green_context.py index 54e01b2f0f0..bf1f72cfb6e 100644 --- a/cuda_core/tests/test_green_context.py +++ b/cuda_core/tests/test_green_context.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 + import contextlib import numpy as np @@ -71,21 +72,6 @@ def green_ctx(init_cuda, sm_resource): ctx.close() -@pytest.fixture -def green_ctx_active(init_cuda, green_ctx): - """Set a green context as current and restore the previous on teardown. - - Yields (dev, green_ctx, stream) for use in kernel launch tests. - """ - dev = init_cuda - prev = dev.set_current(green_ctx) - try: - stream = dev.create_stream() - yield dev, green_ctx, stream - finally: - dev.set_current(prev) - - @pytest.fixture def fill_kernel(init_cuda): """Compile the fill kernel for the current device.""" @@ -299,23 +285,23 @@ def test_is_green(self, green_ctx): assert green_ctx.is_green assert green_ctx.handle is not None - def test_set_current_swap_preserves_identity(self, init_cuda, green_ctx): - dev = init_cuda - with _use_green_ctx(dev, green_ctx): - pass # just verify push/pop works - # After exiting, primary context is restored. - # Verify we can swap in and get identity back: - prev = dev.set_current(green_ctx) - try: - pass - finally: - restored = dev.set_current(prev) - assert restored is green_ctx - assert restored.is_green - - def test_stream_and_event_track_green_context(self, green_ctx_active): - dev, green_ctx, stream = green_ctx_active + def test_create_stream_on_primary_raises(self, init_cuda): + """create_stream is only for green contexts.""" + # The init_cuda fixture sets the primary context + # Get the primary context via device internals + ctx = init_cuda._context + with pytest.raises(RuntimeError, match="only supported on green contexts"): + ctx.create_stream() + + def test_create_stream_explicit(self, green_ctx): + """Create a stream directly from the green context (no set_current).""" + stream = green_ctx.create_stream() + assert stream is not None + assert stream.context.is_green + assert stream.context == green_ctx + def test_stream_and_event_track_green_context(self, green_ctx): + stream = green_ctx.create_stream() event = stream.record() assert stream.context.is_green assert stream.context == green_ctx @@ -325,13 +311,65 @@ def test_stream_and_event_track_green_context(self, green_ctx_active): event.sync() def test_close_while_current_raises(self, init_cuda, green_ctx): + """close() on a current context raises — test via set_current.""" dev = init_cuda with _use_green_ctx(dev, green_ctx), pytest.raises(RuntimeError, match="while it is current"): green_ctx.close() + def test_set_current_swap_regression(self, init_cuda, green_ctx): + """set_current still works (backward compat) and preserves identity.""" + dev = init_cuda + with _use_green_ctx(dev, green_ctx): + pass # just verify push/pop works + # Swap again and check identity round-trip + prev = dev.set_current(green_ctx) + try: + assert prev is not None + finally: + restored = dev.set_current(prev) + assert restored is green_ctx + assert restored.is_green + + +# --------------------------------------------------------------------------- +# Context.resources +# --------------------------------------------------------------------------- + + +class TestContextResources: + def test_green_ctx_sm_resources(self, green_ctx, sm_resource): + """Green context's SM resources should be a subset of device SMs.""" + ctx_sm = green_ctx.resources.sm + assert ctx_sm.sm_count > 0 + assert ctx_sm.sm_count <= sm_resource.sm_count + + def test_green_ctx_resources_reflect_partition(self, init_cuda, sm_resource): + """Two green contexts should have disjoint SM partitions.""" + half = _aligned_half(sm_resource) + if half < sm_resource.min_partition_size: + pytest.skip("Not enough SMs for a 2-group split") + + groups, _ = sm_resource.split(SMResourceOptions(count=(half, half))) + + ctx_a = ctx_b = None + try: + ctx_a = init_cuda.create_context(ContextOptions(resources=[groups[0]])) + ctx_b = init_cuda.create_context(ContextOptions(resources=[groups[1]])) + + sm_a = ctx_a.resources.sm.sm_count + sm_b = ctx_b.resources.sm.sm_count + assert sm_a > 0 + assert sm_b > 0 + assert sm_a + sm_b <= sm_resource.sm_count + finally: + if ctx_b is not None: + ctx_b.close() + if ctx_a is not None: + ctx_a.close() + # --------------------------------------------------------------------------- -# Kernel launch in green context +# Kernel launch in green context (explicit model) # --------------------------------------------------------------------------- @@ -354,10 +392,10 @@ def _launch_fill_and_verify(dev, stream, kernel, n, value): class TestGreenContextKernelLaunch: - def test_launch_and_verify(self, green_ctx_active, fill_kernel): - """Compile, launch in green context, verify results on host.""" - dev, _, stream = green_ctx_active - _launch_fill_and_verify(dev, stream, fill_kernel, n=64, value=42) + def test_launch_and_verify(self, init_cuda, green_ctx, fill_kernel): + """Launch kernel via ctx.create_stream (explicit model, no set_current).""" + stream = green_ctx.create_stream() + _launch_fill_and_verify(init_cuda, stream, fill_kernel, n=64, value=42) def test_two_green_contexts_independent(self, init_cuda, sm_resource, fill_kernel): """Two SM groups -> two green contexts -> two independent kernels.""" @@ -375,9 +413,8 @@ def test_two_green_contexts_independent(self, init_cuda, sm_resource, fill_kerne ctx_b = dev.create_context(ContextOptions(resources=[groups[1]])) for ctx, value in [(ctx_a, 10), (ctx_b, 20)]: - with _use_green_ctx(dev, ctx): - stream = dev.create_stream() - _launch_fill_and_verify(dev, stream, fill_kernel, n=64, value=value) + stream = ctx.create_stream() + _launch_fill_and_verify(dev, stream, fill_kernel, n=64, value=value) finally: if ctx_b is not None: ctx_b.close() @@ -397,8 +434,7 @@ def test_with_workqueue_resource(self, init_cuda, sm_resource, wq_resource, fill assert ctx.is_green try: - with _use_green_ctx(dev, ctx): - stream = dev.create_stream() - _launch_fill_and_verify(dev, stream, fill_kernel, n=32, value=99) + stream = ctx.create_stream() + _launch_fill_and_verify(dev, stream, fill_kernel, n=32, value=99) finally: ctx.close() From 2812c5bacfd322ec5de24714f6cbfd109985d54e Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 27 Apr 2026 03:55:44 +0000 Subject: [PATCH 09/11] Harden green context stream creation and resource queries - Let the driver validate the nonblocking flag for green context streams: cuGreenCtxStreamCreate rejects CU_STREAM_DEFAULT. On failure, check if the context is green + nonblocking is False and raise a clear ValueError. - cuCtxGetStreamPriorityRange failure (CUDA_ERROR_INVALID_CONTEXT) now raises: "No current CUDA context. Call dev.set_current() before creating streams." - C++ create_stream_handle returns CUDA_ERROR_NOT_SUPPORTED if the context is green but cuGreenCtxStreamCreate is unavailable (CUDA < 12.5), instead of falling through to cuStreamCreateWithPriority. - ctx.resources.workqueue now dispatches to cuGreenCtxGetDevResource for green contexts, matching the SM query path. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 6 ++- cuda_core/cuda/core/_device_resources.pyx | 40 +++++++++++++------ cuda_core/cuda/core/_stream.pyx | 26 +++++++++++- cuda_core/cuda/core/typing.py | 2 + cuda_core/docs/source/api_private.rst | 1 + cuda_core/tests/test_green_context.py | 7 ++++ 6 files changed, 67 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 261efcfb368..2413d9473c7 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -414,7 +414,11 @@ StreamHandle create_stream_handle(const ContextHandle& h_ctx, unsigned int flags // Dispatch: green context uses cuGreenCtxStreamCreate, primary uses cuStreamCreateWithPriority GreenCtxHandle h_green = get_context_green_ctx(h_ctx); - if (h_green && p_cuGreenCtxStreamCreate) { + if (h_green) { + if (!p_cuGreenCtxStreamCreate) { + err = CUDA_ERROR_NOT_SUPPORTED; + return {}; + } if (CUDA_SUCCESS != (err = p_cuGreenCtxStreamCreate(&stream, as_cu(h_green), flags, priority))) { return {}; } diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx index 683cdf9cd9e..b7537cf8593 100644 --- a/cuda_core/cuda/core/_device_resources.pyx +++ b/cuda_core/cuda/core/_device_resources.pyx @@ -548,19 +548,35 @@ cdef class DeviceResources: cdef cydriver.CUdevResource _wq IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef GreenCtxHandle h_green if self._h_context: - # Context-level query - with nogil: - HANDLE_RETURN(cydriver.cuCtxGetDevResource( - as_cu(self._h_context), - &_wq_config, - cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG, - )) - HANDLE_RETURN(cydriver.cuCtxGetDevResource( - as_cu(self._h_context), - &_wq, - cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE, - )) + h_green = get_context_green_ctx(self._h_context) + if h_green: + # Green context query + with nogil: + HANDLE_RETURN(cydriver.cuGreenCtxGetDevResource( + as_cu(h_green), + &_wq_config, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG, + )) + HANDLE_RETURN(cydriver.cuGreenCtxGetDevResource( + as_cu(h_green), + &_wq, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE, + )) + else: + # Primary context query + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetDevResource( + as_cu(self._h_context), + &_wq_config, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG, + )) + HANDLE_RETURN(cydriver.cuCtxGetDevResource( + as_cu(self._h_context), + &_wq, + cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE, + )) else: # Device-level query with nogil: diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index d3b41139436..783df99b11c 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -31,6 +31,7 @@ from cuda.core._resource_handles cimport ( create_stream_handle, create_stream_handle_with_owner, get_current_context, + get_last_error, get_legacy_stream, get_per_thread_stream, get_stream_context, @@ -143,8 +144,15 @@ cdef class Stream: else cydriver.CUstream_flags.CU_STREAM_DEFAULT) # TODO: we might want to consider memoizing high/low per CUDA context and avoid this call cdef int high, low + cdef cydriver.CUresult res_code with nogil: - HANDLE_RETURN(cydriver.cuCtxGetStreamPriorityRange(&high, &low)) + res_code = cydriver.cuCtxGetStreamPriorityRange(&high, &low) + if res_code != cydriver.CUresult.CUDA_SUCCESS: + if res_code == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT: + raise RuntimeError( + "No current CUDA context. Call dev.set_current() before creating streams." + ) + HANDLE_RETURN(res_code) cdef int prio if priority is not None: prio = priority @@ -157,7 +165,21 @@ cdef class Stream: # For green contexts, the C++ layer auto-dispatches to cuGreenCtxStreamCreate. h_stream = create_stream_handle(h_context, flags, prio) if not h_stream: - raise RuntimeError("Failed to create CUDA stream") + res_code = get_last_error() + if not nonblocking and res_code == cydriver.CUresult.CUDA_ERROR_INVALID_VALUE: + # cuGreenCtxStreamCreate rejects CU_STREAM_DEFAULT; + # no need to check is_green since primary streams don't fail this way + raise ValueError( + "Green context streams must be non-blocking. " + "Use StreamOptions(nonblocking=True) or omit the option (True is the default)." + ) + elif res_code == cydriver.CUresult.CUDA_ERROR_NOT_SUPPORTED: + raise RuntimeError( + "cuGreenCtxStreamCreate is not available. " + "Green context stream creation requires CUDA 12.5 or newer." + ) + else: + HANDLE_RETURN(res_code) self = Stream._from_handle(cls, h_stream) self._nonblocking = int(nonblocking) self._priority = prio diff --git a/cuda_core/cuda/core/typing.py b/cuda_core/cuda/core/typing.py index a66ab1881fb..922e6b0ae6e 100644 --- a/cuda_core/cuda/core/typing.py +++ b/cuda_core/cuda/core/typing.py @@ -4,10 +4,12 @@ """Public type aliases and protocols used in cuda.core API signatures.""" +from cuda.core._context import DeviceResourcesT from cuda.core._memory._buffer import DevicePointerT from cuda.core._stream import IsStreamT __all__ = [ "DevicePointerT", + "DeviceResourcesT", "IsStreamT", ] diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index 141773967e8..95f68482ef3 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -17,6 +17,7 @@ CUDA runtime :toctree: generated/ typing.DevicePointerT + typing.DeviceResourcesT _memory._virtual_memory_resource.VirtualMemoryAllocationTypeT _memory._virtual_memory_resource.VirtualMemoryLocationTypeT _memory._virtual_memory_resource.VirtualMemoryGranularityT diff --git a/cuda_core/tests/test_green_context.py b/cuda_core/tests/test_green_context.py index bf1f72cfb6e..2cc2eb6579b 100644 --- a/cuda_core/tests/test_green_context.py +++ b/cuda_core/tests/test_green_context.py @@ -293,6 +293,13 @@ def test_create_stream_on_primary_raises(self, init_cuda): with pytest.raises(RuntimeError, match="only supported on green contexts"): ctx.create_stream() + def test_create_stream_blocking_raises(self, green_ctx): + """Green context streams must be non-blocking.""" + from cuda.core import StreamOptions + + with pytest.raises(ValueError, match="must be non-blocking"): + green_ctx.create_stream(StreamOptions(nonblocking=False)) + def test_create_stream_explicit(self, green_ctx): """Create a stream directly from the green context (no set_current).""" stream = green_ctx.create_stream() From 340506ee95bfa51a00b5770d4eb91f13366f8a94 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 27 Apr 2026 12:30:20 +0000 Subject: [PATCH 10/11] Add Stream.resources property Stream.resources delegates to DeviceResources._init_from_ctx via the stream's tracked context handle, returning the same resource view as ctx.resources for the stream's parent context. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_stream.pyx | 13 +++++++++++++ cuda_core/tests/test_green_context.py | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index 783df99b11c..e3865bcc542 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -21,6 +21,7 @@ from dataclasses import dataclass from typing import Protocol from cuda.core._context cimport Context +from cuda.core._device_resources cimport DeviceResources from cuda.core._event import Event, EventOptions from cuda.core._resource_handles cimport ( ContextHandle, @@ -346,6 +347,18 @@ cdef class Stream: Stream_ensure_ctx_device(self) return Context._from_handle(Context, self._h_context, self._device_id) + @property + def resources(self): + """Query the hardware resources provisioned for this stream's context. + + For streams created from a green context, returns the resources + that context was provisioned with. For streams on the primary + context, returns the full device resources. + """ + Stream_ensure_ctx(self) + Stream_ensure_ctx_device(self) + return DeviceResources._init_from_ctx(self._h_context, self._device_id) + @staticmethod def from_handle(handle: int) -> Stream: """Create a new :obj:`~_stream.Stream` object from a foreign stream handle. diff --git a/cuda_core/tests/test_green_context.py b/cuda_core/tests/test_green_context.py index 2cc2eb6579b..8264a587d84 100644 --- a/cuda_core/tests/test_green_context.py +++ b/cuda_core/tests/test_green_context.py @@ -374,6 +374,24 @@ def test_green_ctx_resources_reflect_partition(self, init_cuda, sm_resource): if ctx_a is not None: ctx_a.close() + def test_stream_resources_match_context(self, green_ctx, sm_resource): + """stream.resources should return the same as ctx.resources.""" + stream = green_ctx.create_stream() + + stream_sm = stream.resources.sm + ctx_sm = green_ctx.resources.sm + assert stream_sm.sm_count == ctx_sm.sm_count + assert stream_sm.sm_count > 0 + assert stream_sm.sm_count <= sm_resource.sm_count + + try: + stream_wq = stream.resources.workqueue + ctx_wq = green_ctx.resources.workqueue + assert stream_wq.handle != 0 + assert ctx_wq.handle != 0 + except (NotImplementedError, CUDAError): + pass # workqueue not available on this driver/build + # --------------------------------------------------------------------------- # Kernel launch in green context (explicit model) From fa254a514dbd1ec051e0673a9b4271bd770b8aee Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 27 Apr 2026 12:36:35 +0000 Subject: [PATCH 11/11] Polish green context API: docs, error handling, simplification - dev.create_context raises ValueError (not NotImplementedError) when options or resources are missing. - Cache version checks (_check_green_ctx_support, _check_workqueue_support) at module level; raise ValueError instead of NotImplementedError. - Simplify _device_resources.pyx: merge _as_uint and _count_to_sm_count into _to_sm_count; inline unsigned int casts for coscheduled params. - Add green context classes to api.rst (Context, ContextOptions, DeviceResources, SMResource, SMResourceOptions, WorkqueueResource, WorkqueueResourceOptions). - Update all docstrings to NumPy style with Attributes/Parameters/Returns sections matching the existing codebase convention. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_context.pyx | 2 +- cuda_core/cuda/core/_device.pyx | 14 +- cuda_core/cuda/core/_device_resources.pyx | 230 ++++++++++++++-------- cuda_core/docs/source/api.rst | 6 + cuda_core/docs/source/api_private.rst | 1 + cuda_core/tests/test_green_context.py | 12 +- 6 files changed, 171 insertions(+), 94 deletions(-) diff --git a/cuda_core/cuda/core/_context.pyx b/cuda_core/cuda/core/_context.pyx index a8bddf67dc8..225500c7093 100644 --- a/cuda_core/cuda/core/_context.pyx +++ b/cuda_core/cuda/core/_context.pyx @@ -149,7 +149,7 @@ cdef class ContextOptions: Attributes ---------- - resources : :obj:`~_context.DeviceResourcesT` + resources : :obj:`~cuda.core.typing.DeviceResourcesT` Device resources used to create a green context. """ resources: DeviceResourcesT diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index 1a96841ed09..32b96acb99d 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -1302,11 +1302,15 @@ class Device: cdef GreenCtxHandle h_green if options is None: - raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") + raise ValueError( + "options with device resources must be provided to create a green context" + ) options = check_or_create_options(ContextOptions, options, "Context options") if options.resources is None: - raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") + raise ValueError( + "ContextOptions.resources must be provided to create a green context" + ) resources = tuple(options.resources) if len(resources) == 0: @@ -1334,9 +1338,9 @@ class Device: h_green = create_green_ctx_handle( c_resources, - n_resources, - self._device_id, - cydriver.CUgreenCtxCreate_flags.CU_GREEN_CTX_DEFAULT_STREAM, + (n_resources), + (self._device_id), + (cydriver.CUgreenCtxCreate_flags.CU_GREEN_CTX_DEFAULT_STREAM), ) if h_green.get() == NULL: HANDLE_RETURN(get_last_error()) diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx index b7537cf8593..e4851a73a41 100644 --- a/cuda_core/cuda/core/_device_resources.pyx +++ b/cuda_core/cuda/core/_device_resources.pyx @@ -27,45 +27,85 @@ __all__ = [ ] +# Module-level cached version checks (trinary: 0=unchecked, 1=supported, -1=unsupported) +cdef int _green_ctx_checked = 0 +cdef int _workqueue_checked = 0 +cdef str _green_ctx_err_msg = "" +cdef str _workqueue_err_msg = "" + + cdef inline int _check_green_ctx_support() except?-1: + global _green_ctx_checked, _green_ctx_err_msg + if _green_ctx_checked == 1: + return 0 + if _green_ctx_checked == -1: + raise RuntimeError(_green_ctx_err_msg) cdef tuple drv = cy_driver_version() cdef tuple bind = cy_binding_version() if drv < (12, 4, 0): - raise NotImplementedError( - "Green context support requires CUDA driver 12.4 or newer. " - f"Using driver version {'.'.join(map(str, drv))}" + _green_ctx_err_msg = ( + "Green context support requires CUDA driver 12.4 or newer " + f"(current driver: {'.'.join(map(str, drv))})" ) + _green_ctx_checked = -1 + raise RuntimeError(_green_ctx_err_msg) if bind < (12, 4, 0): - raise NotImplementedError( - "Green context support requires cuda.bindings 12.4 or newer. " - f"Using cuda.bindings version {'.'.join(map(str, bind))}" + _green_ctx_err_msg = ( + "Green context support requires cuda.bindings 12.4 or newer " + f"(current bindings: {'.'.join(map(str, bind))})" ) + _green_ctx_checked = -1 + raise RuntimeError(_green_ctx_err_msg) + _green_ctx_checked = 1 return 0 cdef inline int _check_workqueue_support() except?-1: + global _workqueue_checked, _workqueue_err_msg + if _workqueue_checked == 1: + return 0 + if _workqueue_checked == -1: + raise RuntimeError(_workqueue_err_msg) cdef tuple drv = cy_driver_version() cdef tuple bind = cy_binding_version() if drv < (13, 1, 0): - raise NotImplementedError( - "WorkqueueResource requires CUDA driver 13.1 or newer. " - f"Using driver version {'.'.join(map(str, drv))}" + _workqueue_err_msg = ( + "WorkqueueResource requires CUDA driver 13.1 or newer " + f"(current driver: {'.'.join(map(str, drv))})" ) + _workqueue_checked = -1 + raise RuntimeError(_workqueue_err_msg) if bind < (13, 1, 0): - raise NotImplementedError( - "WorkqueueResource requires cuda.bindings 13.1 or newer. " - f"Using cuda.bindings version {'.'.join(map(str, bind))}" + _workqueue_err_msg = ( + "WorkqueueResource requires cuda.bindings 13.1 or newer " + f"(current bindings: {'.'.join(map(str, bind))})" ) + _workqueue_checked = -1 + raise RuntimeError(_workqueue_err_msg) + _workqueue_checked = 1 return 0 @dataclass cdef class SMResourceOptions: - """Options for :meth:`SMResource.split`. - - ``count`` determines the number of requested groups. Scalar ``count`` or - ``None`` creates one group; a sequence creates ``len(count)`` groups. Other - sequence fields must match the length of ``count``. + """Customizable :obj:`SMResource.split` options. + + Each field accepts a scalar (for a single group) or a ``Sequence`` + (for multiple groups). ``count`` drives the number of groups; other + ``Sequence`` fields must match its length. + + Attributes + ---------- + count : int or Sequence[int], optional + Requested SM count per group. ``None`` means discovery mode + (auto-detect). (Default to ``None``) + coscheduled_sm_count : int or Sequence[int], optional + Minimum number of SMs guaranteed to be co-scheduled in each + group. (Default to ``None``) + preferred_coscheduled_sm_count : int or Sequence[int], optional + Preferred co-scheduled SM count; the driver tries to satisfy + this but may fall back to ``coscheduled_sm_count``. + (Default to ``None``) """ count: int | SequenceABC | None = None @@ -75,7 +115,14 @@ cdef class SMResourceOptions: @dataclass cdef class WorkqueueResourceOptions: - """Options for :meth:`WorkqueueResource.configure`.""" + """Customizable :obj:`WorkqueueResource.configure` options. + + Attributes + ---------- + sharing_scope : str, optional + Workqueue sharing scope. Accepted values: ``"device_ctx"`` + or ``"green_ctx_balanced"``. (Default to ``None``) + """ sharing_scope: str | None = None @@ -97,7 +144,7 @@ cdef inline int _validate_split_field_length( return 0 -cdef int _resolve_group_count(SMResourceOptions options) except -1: +cdef inline int _resolve_group_count(SMResourceOptions options) except?-1: cdef object count = options.count cdef int n_groups cdef bint count_is_scalar @@ -128,47 +175,34 @@ cdef int _resolve_group_count(SMResourceOptions options) except -1: return n_groups -cdef object _broadcast_field(object value, int n_groups): +cdef inline object _broadcast_field(object value, int n_groups): if is_sequence(value): return list(value) return [value] * n_groups -cdef inline unsigned int _as_uint(object value, str field_name) except? 0: - if not isinstance(value, int): - raise TypeError(f"{field_name} must be an int or None, got {type(value)}") - if value < 0: - raise ValueError(f"{field_name} must be non-negative") - return value - - -cdef inline unsigned int _count_to_sm_count(object value) except? 0: +cdef inline unsigned int _to_sm_count(object value) except? 0: + """Convert a count value to unsigned int. None maps to 0 (discovery).""" if value is None: return 0 - return _as_uint(value, "count") + if value < 0: + raise ValueError(f"count must be non-negative, got {value}") + return (value) + +cdef int _structured_split_checked = 0 cdef inline bint _can_use_structured_sm_split(): + """Check if cuDevSmResourceSplit (13.1+) is available. Cached.""" + global _structured_split_checked + if _structured_split_checked != 0: + return _structured_split_checked == 1 IF CUDA_CORE_BUILD_MAJOR >= 13: - return cy_driver_version() >= (13, 1, 0) and cy_binding_version() >= (13, 1, 0) - ELSE: - return False - - -cdef inline int _check_split_by_count_support() except?-1: - cdef tuple drv = cy_driver_version() - cdef tuple bind = cy_binding_version() - if drv < (12, 4, 0): - raise NotImplementedError( - "SMResource.split() requires CUDA driver 12.4 or newer. " - f"Using driver version {'.'.join(map(str, drv))}" - ) - if bind < (12, 4, 0): - raise NotImplementedError( - "SMResource.split() requires cuda.bindings 12.4 or newer. " - f"Using cuda.bindings version {'.'.join(map(str, bind))}" - ) - return 0 + if cy_driver_version() >= (13, 1, 0) and cy_binding_version() >= (13, 1, 0): + _structured_split_checked = 1 + return True + _structured_split_checked = -1 + return False cdef object _resolve_split_by_count_request(SMResourceOptions options): @@ -179,29 +213,29 @@ cdef object _resolve_split_by_count_request(SMResourceOptions options): cdef unsigned int min_count if options.coscheduled_sm_count is not None: - raise NotImplementedError( + raise RuntimeError( "SMResourceOptions.coscheduled_sm_count requires the CUDA 13.1 " "structured SM split API" ) if options.preferred_coscheduled_sm_count is not None: - raise NotImplementedError( + raise RuntimeError( "SMResourceOptions.preferred_coscheduled_sm_count requires the " "CUDA 13.1 structured SM split API" ) for value in counts[1:]: if value != first: - raise NotImplementedError( + raise RuntimeError( "CUDA 12 SM splitting only supports homogeneous count values; " "use CUDA 13.1 or newer for per-group counts" ) - min_count = _count_to_sm_count(first) + min_count = _to_sm_count(first) return n_groups, min_count IF CUDA_CORE_BUILD_MAJOR >= 13: - cdef int _fill_group_params( + cdef inline int _fill_group_params( cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS* params, int n_groups, SMResourceOptions options, @@ -213,13 +247,11 @@ IF CUDA_CORE_BUILD_MAJOR >= 13: for i in range(n_groups): memset(¶ms[i], 0, sizeof(cydriver.CU_DEV_SM_RESOURCE_GROUP_PARAMS)) - params[i].smCount = _count_to_sm_count(counts[i]) + params[i].smCount = _to_sm_count(counts[i]) if coscheduled[i] is not None: - params[i].coscheduledSmCount = _as_uint(coscheduled[i], "coscheduled_sm_count") + params[i].coscheduledSmCount = (coscheduled[i]) if preferred[i] is not None: - params[i].preferredCoscheduledSmCount = _as_uint( - preferred[i], "preferred_coscheduled_sm_count" - ) + params[i].preferredCoscheduledSmCount = (preferred[i]) params[i].flags = 0 return 0 @@ -253,7 +285,7 @@ IF CUDA_CORE_BUILD_MAJOR >= 13: with nogil: HANDLE_RETURN(cydriver.cuDevSmResourceSplit( result, - n_groups, + (n_groups), &sm._resource, &remaining, 0, @@ -278,15 +310,15 @@ IF CUDA_CORE_BUILD_MAJOR >= 13: free(result) ELSE: cdef object _split_with_general_api(SMResource sm, SMResourceOptions options, bint dry_run): - raise NotImplementedError( + raise RuntimeError( "SMResource.split() requires cuda.core to be built with CUDA 13.x bindings" ) cdef object _split_with_count_api(SMResource sm, SMResourceOptions options, bint dry_run): cdef object request = _resolve_split_by_count_request(options) - cdef unsigned int nb_groups = request[0] - cdef unsigned int min_count = request[1] + cdef unsigned int nb_groups = (request[0]) + cdef unsigned int min_count = (request[1]) cdef unsigned int actual_groups = nb_groups cdef cydriver.CUdevResource* result = NULL cdef cydriver.CUdevResource remaining @@ -328,7 +360,7 @@ cdef inline unsigned int _sm_resource_granularity(int device_id) except? 0: HANDLE_RETURN(cydriver.cuDeviceGetAttribute( &major, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device_id, + (device_id), )) if major >= 9: return 8 @@ -342,7 +374,11 @@ cdef inline unsigned int _fallback_if_zero(unsigned int value, unsigned int fall cdef class SMResource: - """SM resource queried from a device. Not user-constructible.""" + """Represent an SM (streaming multiprocessor) resource partition. + + Instances are returned by :obj:`DeviceResources.sm` or + :meth:`SMResource.split` and cannot be instantiated directly. + """ def __init__(self, *args, **kwargs): raise RuntimeError( @@ -391,7 +427,7 @@ cdef class SMResource: @property def handle(self) -> int: """Return the address of the underlying ``CUdevResource`` struct.""" - return &self._resource + return (&self._resource) @property def sm_count(self) -> int: @@ -414,7 +450,22 @@ cdef class SMResource: return self._flags def split(self, options not None, *, bint dry_run=False): - """Split this SM resource into groups plus a remainder.""" + """Split this SM resource into groups and a remainder. + + Parameters + ---------- + options : :obj:`SMResourceOptions` + Split configuration (count, co-scheduling constraints). + dry_run : bool, optional + If ``True``, return filled-in metadata without creating + usable resource objects. (Default to ``False``) + + Returns + ------- + tuple[list[:obj:`SMResource`], :obj:`SMResource`] + ``(groups, remainder)`` where each group holds a disjoint + SM partition and *remainder* holds any unassigned SMs. + """ cdef SMResourceOptions opts = check_or_create_options( SMResourceOptions, options, "SM resource options" ) @@ -422,12 +473,18 @@ cdef class SMResource: _check_green_ctx_support() if _can_use_structured_sm_split(): return _split_with_general_api(self, opts, dry_run) - _check_split_by_count_support() + # SplitByCount requires the same 12.4+ as green ctx support (already checked above) return _split_with_count_api(self, opts, dry_run) cdef class WorkqueueResource: - """Workqueue resource. Not user-constructible.""" + """Represent a workqueue resource for a device or green context. + + Merges ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG`` and + ``CU_DEV_RESOURCE_TYPE_WORKQUEUE`` under one user-facing type. + Instances are returned by :obj:`DeviceResources.workqueue` and + cannot be instantiated directly. + """ def __init__(self, *args, **kwargs): raise RuntimeError( @@ -448,10 +505,16 @@ cdef class WorkqueueResource: @property def handle(self) -> int: """Return the address of the underlying config ``CUdevResource`` struct.""" - return &self._wq_config_resource + return (&self._wq_config_resource) def configure(self, options not None): - """Configure the workqueue resource in place.""" + """Configure the workqueue resource in place. + + Parameters + ---------- + options : :obj:`WorkqueueResourceOptions` + Configuration options (sharing scope, etc.). + """ cdef WorkqueueResourceOptions opts = check_or_create_options( WorkqueueResourceOptions, options, "Workqueue resource options" ) @@ -475,17 +538,20 @@ cdef class WorkqueueResource: "Expected 'device_ctx' or 'green_ctx_balanced'." ) ELSE: - raise NotImplementedError( + raise RuntimeError( "WorkqueueResource requires cuda.core to be built with CUDA 13.x bindings" ) cdef class DeviceResources: - """Namespace for hardware resource query. Not user-constructible. + """Namespace for hardware resource queries. + + When obtained via :obj:`Device.resources`, queries return full device + resources. When obtained via :obj:`Context.resources` or + :obj:`Stream.resources`, queries return the resources provisioned for + that context. - When obtained via ``dev.resources``, queries return full device resources. - When obtained via ``ctx.resources``, queries return the resources - provisioned for that context. + This class cannot be instantiated directly. """ def __init__(self, *args, **kwargs): @@ -525,14 +591,14 @@ cdef class DeviceResources: )) else: HANDLE_RETURN(cydriver.cuDeviceGetDevResource( - self._device_id, res, + (self._device_id), res, cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM, )) return 0 @property def sm(self) -> SMResource: - """Query SM resources.""" + """Return the :obj:`SMResource` for this device or context.""" _check_green_ctx_support() cdef cydriver.CUdevResource res with nogil: @@ -541,7 +607,7 @@ cdef class DeviceResources: @property def workqueue(self) -> WorkqueueResource: - """Query workqueue resources.""" + """Return the :obj:`WorkqueueResource` for this device or context.""" _check_green_ctx_support() _check_workqueue_support() cdef cydriver.CUdevResource _wq_config @@ -581,17 +647,17 @@ cdef class DeviceResources: # Device-level query with nogil: HANDLE_RETURN(cydriver.cuDeviceGetDevResource( - self._device_id, + (self._device_id), &_wq_config, cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG, )) HANDLE_RETURN(cydriver.cuDeviceGetDevResource( - self._device_id, + (self._device_id), &_wq, cydriver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_WORKQUEUE, )) return WorkqueueResource._from_dev_resources(_wq_config, _wq) ELSE: - raise NotImplementedError( + raise RuntimeError( "WorkqueueResource requires cuda.core to be built with CUDA 13.x bindings" ) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 88780732d54..8d591316f91 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -26,12 +26,18 @@ Devices and execution Stream Event + Context + SMResource + WorkqueueResource :template: dataclass.rst StreamOptions EventOptions LaunchConfig + ContextOptions + SMResourceOptions + WorkqueueResourceOptions .. data:: LEGACY_DEFAULT_STREAM diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index 95f68482ef3..de100c7152e 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -31,6 +31,7 @@ CUDA runtime :template: autosummary/cyclass.rst _device.DeviceProperties + _device_resources.DeviceResources _memory._ipc.IPCAllocationHandle _memory._ipc.IPCBufferDescriptor diff --git a/cuda_core/tests/test_green_context.py b/cuda_core/tests/test_green_context.py index 8264a587d84..8eb32f7c1aa 100644 --- a/cuda_core/tests/test_green_context.py +++ b/cuda_core/tests/test_green_context.py @@ -47,7 +47,7 @@ def sm_resource(init_cuda): """Query SM resources from the device, skip if unsupported.""" try: return init_cuda.resources.sm - except (NotImplementedError, CUDAError) as exc: + except (RuntimeError, ValueError, CUDAError) as exc: pytest.skip(str(exc)) @@ -56,7 +56,7 @@ def wq_resource(init_cuda): """Query workqueue resources from the device, skip if unsupported.""" try: return init_cuda.resources.workqueue - except (NotImplementedError, CUDAError) as exc: + except (RuntimeError, ValueError, CUDAError) as exc: pytest.skip(str(exc)) @@ -113,10 +113,10 @@ def test_not_user_constructible(): WorkqueueResource() -def test_create_context_without_resources_stays_unimplemented(init_cuda): - with pytest.raises(NotImplementedError): +def test_create_context_requires_resources(init_cuda): + with pytest.raises(ValueError, match="resources must be provided"): init_cuda.create_context() - with pytest.raises(NotImplementedError): + with pytest.raises(ValueError, match="resources must be provided"): init_cuda.create_context(ContextOptions(resources=None)) with pytest.raises(TypeError): init_cuda.create_context(object()) @@ -389,7 +389,7 @@ def test_stream_resources_match_context(self, green_ctx, sm_resource): ctx_wq = green_ctx.resources.workqueue assert stream_wq.handle != 0 assert ctx_wq.handle != 0 - except (NotImplementedError, CUDAError): + except (RuntimeError, ValueError, CUDAError): pass # workqueue not available on this driver/build