diff --git a/python/cuda_cccl/cuda/compute/_nvtx.py b/python/cuda_cccl/cuda/compute/_nvtx.py
new file mode 100644
index 00000000000..364af934b6b
--- /dev/null
+++ b/python/cuda_cccl/cuda/compute/_nvtx.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+NVTX annotation utilities for cuda.compute module.
+Uses NVIDIA green (76B900) color and cuda.compute domain.
+"""
+
+import functools
+from typing import Callable, ParamSpec, TypeVar
+
+import nvtx
+
+# NVIDIA green color hex value (76B900)
+NVIDIA_GREEN = 0x76B900
+
+# Domain name for cuda.compute annotations
+COMPUTE_DOMAIN = "cuda.compute"
+
+# Type variables that let the decorator hand back the wrapped function's own signature
+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+
+
+def annotate(
+    message: str | None = None,
+    domain: str | None = None,
+    category: str | int | None = None,
+    color: str | int | None = None,
+) -> Callable[[Callable[_P, _R]], Callable[_P, _R]]:
+    """
+    Decorator factory that wraps every call of a function in an NVTX range.
+
+    Args:
+        message: Optional message to display. If None, uses the function name.
+        domain: Optional NVTX domain string. Defaults to "cuda.compute".
+        category: Optional category for the annotation.
+        color: Optional color in hexadecimal format (0xRRGGBB). Defaults to NVIDIA green (0x76B900).
+
+    Returns:
+        A decorator. The function it returns keeps the name, docstring and
+        signature of the decorated function.
+    """
+
+    def decorator(func: Callable[_P, _R]) -> Callable[_P, _R]:
+        # Message, domain, color and category are fixed once the decorator is
+        # applied, so resolve the defaults and build the NVTX range object a
+        # single time here instead of on every call of the wrapped function.
+        nvtx_range = nvtx.annotate(
+            message if message is not None else func.__name__,
+            domain=domain if domain is not None else COMPUTE_DOMAIN,
+            color=color if color is not None else NVIDIA_GREEN,
+            category=category,
+        )
+
+        @functools.wraps(func)
+        def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
+            # Using the range as a context manager closes it even when func raises.
+            with nvtx_range:
+                return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_histogram.py b/python/cuda_cccl/cuda/compute/algorithms/_histogram.py
index 9e15d3c3ff0..563fd30d418 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_histogram.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_histogram.py
@@ -11,6 +11,7 @@
 from .. import _cccl_interop as cccl
 from .._caching import cache_with_key
 from .._cccl_interop import call_build, set_cccl_iterator_state, to_cccl_value_state
+from .._nvtx import annotate
 from .._utils.protocols import get_data_pointer, get_dtype, validate_and_get_stream
 from .._utils.temp_storage_buffer import TempStorageBuffer
 from ..iterators._iterators import IteratorBase
@@ -90,6 +91,7 @@ def __init__(
             is_evenly_segmented,
         )
 
+    @annotate(message="_Histogram.__call__")
     def __call__(
         self,
         temp_storage,
@@ -134,6 +136,7 @@ def __call__(
         return temp_storage_bytes
 
 
+@annotate()
 @cache_with_key(make_cache_key)
 def make_histogram_even(
     d_samples: DeviceArrayLike | IteratorBase,
@@ -173,6 +176,7 @@ def make_histogram_even(
     )
 
 
+@annotate()
 def histogram_even(
     d_samples: DeviceArrayLike | IteratorBase,
     d_histogram: DeviceArrayLike,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_reduce.py b/python/cuda_cccl/cuda/compute/algorithms/_reduce.py
index d527a40a4e9..859fd339ff8 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_reduce.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_reduce.py
@@ -16,6 +17,7 @@
set_cccl_iterator_state, to_cccl_value_state, ) +from .._nvtx import annotate from .._utils import protocols from .._utils.protocols import get_data_pointer, validate_and_get_stream from .._utils.temp_storage_buffer import TempStorageBuffer @@ -140,6 +141,7 @@ def _make_reduce_into_cached( # TODO Figure out `sum` without operator and initial value # TODO Accept stream +@annotate() def make_reduce_into( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, @@ -170,6 +172,7 @@ def make_reduce_into( return _make_reduce_into_cached(d_in, d_out, op_adapter, h_init, **kwargs) +@annotate() def reduce_into( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_scan.py b/python/cuda_cccl/cuda/compute/algorithms/_scan.py index ef9861aca22..c71edb0eb4c 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_scan.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_scan.py @@ -17,6 +17,7 @@ set_cccl_iterator_state, to_cccl_value_state, ) +from .._nvtx import annotate from .._utils import protocols from .._utils.protocols import get_data_pointer, validate_and_get_stream from .._utils.temp_storage_buffer import TempStorageBuffer @@ -114,6 +115,7 @@ def __init__( case (False, _bindings.InitKind.NO_INIT): raise ValueError("Exclusive scan with No init value is not supported") + @annotate(message="_Scan.__call__") def __call__( self, temp_storage, @@ -214,6 +216,7 @@ def _make_inclusive_scan_cached( # TODO Figure out `sum` without operator and initial value # TODO Accept stream +@annotate() def make_exclusive_scan( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, @@ -243,6 +246,7 @@ def make_exclusive_scan( return _make_exclusive_scan_cached(d_in, d_out, op_adapter, init_value) +@annotate() def exclusive_scan( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, @@ -280,6 +284,7 @@ def exclusive_scan( # TODO Figure out `sum` without 
operator and initial value # TODO Accept stream +@annotate() def make_inclusive_scan( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, @@ -309,6 +314,7 @@ def make_inclusive_scan( return _make_inclusive_scan_cached(d_in, d_out, op_adapter, init_value) +@annotate() def inclusive_scan( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py b/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py index ff50b294b69..eadc7bf00d1 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py @@ -11,6 +11,7 @@ set_cccl_iterator_state, to_cccl_value_state, ) +from .._nvtx import annotate from .._utils import protocols from .._utils.protocols import ( get_data_pointer, @@ -84,6 +85,7 @@ def __init__( self.h_init_cccl, ) + @annotate(message="_SegmentedReduce.__call__") def __call__( self, temp_storage, @@ -171,6 +173,7 @@ def _make_segmented_reduce_cached( return _SegmentedReduce(d_in, d_out, start_offsets_in, end_offsets_in, op, h_init) +@annotate() def make_segmented_reduce( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, @@ -206,6 +209,7 @@ def make_segmented_reduce( ) +@annotate() def segmented_reduce( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_select.py b/python/cuda_cccl/cuda/compute/algorithms/_select.py index 89d26a268bd..7c01f2427c0 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_select.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_select.py @@ -6,6 +6,7 @@ from typing import Callable from .._caching import cache_with_key +from .._nvtx import annotate from .._utils import protocols from .._utils.temp_storage_buffer import TempStorageBuffer from ..iterators._factories import DiscardIterator @@ -62,6 +63,7 @@ def 
_cccl_always_false(x): _cccl_always_false, # select_second_part_op - always false ) + @annotate(message="_Select.__call__") def __call__( self, temp_storage, @@ -97,6 +99,7 @@ def _make_select_cached( return _Select(d_in, d_out, d_num_selected_out, cond) +@annotate() def make_select( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, @@ -131,6 +134,7 @@ def make_select( return _make_select_cached(d_in, d_out, d_num_selected_out, cond_adapter) +@annotate() def select( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py index 84d5134c537..41f058dabc4 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py @@ -11,6 +11,7 @@ from ... import _cccl_interop as cccl from ..._caching import cache_with_key from ..._cccl_interop import call_build, set_cccl_iterator_state +from ..._nvtx import annotate from ..._utils import protocols from ..._utils.protocols import ( get_data_pointer, @@ -103,6 +104,7 @@ def __init__( self.op_cccl, ) + @annotate(message="_MergeSort.__call__") def __call__( self, temp_storage, @@ -161,6 +163,7 @@ def _make_merge_sort_cached( return _MergeSort(d_in_keys, d_in_items, d_out_keys, d_out_items, op) +@annotate() def make_merge_sort( d_in_keys: DeviceArrayLike | IteratorBase, d_in_items: DeviceArrayLike | IteratorBase | None, @@ -194,6 +197,7 @@ def make_merge_sort( ) +@annotate() def merge_sort( d_in_keys: DeviceArrayLike | IteratorBase, d_in_items: DeviceArrayLike | IteratorBase | None, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py index 1080143018a..66c292b890a 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py +++ 
b/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py @@ -7,6 +7,7 @@ from ... import _cccl_interop as cccl from ..._caching import cache_with_key from ..._cccl_interop import call_build, set_cccl_iterator_state +from ..._nvtx import annotate from ..._utils.protocols import ( get_data_pointer, get_dtype, @@ -94,6 +95,7 @@ def __init__( decomposer_return_type, ) + @annotate(message="_RadixSort.__call__") def __call__( self, temp_storage, @@ -164,6 +166,7 @@ def __call__( return temp_storage_bytes +@annotate() @cache_with_key(make_cache_key) def make_radix_sort( d_in_keys: DeviceArrayLike | DoubleBuffer, @@ -195,6 +198,7 @@ def make_radix_sort( return _RadixSort(d_in_keys, d_out_keys, d_in_values, d_out_values, order) +@annotate() def radix_sort( d_in_keys: DeviceArrayLike | DoubleBuffer, d_out_keys: DeviceArrayLike | None, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py index 2df9bcc3db7..d80a2532766 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py @@ -8,6 +8,7 @@ from ... 
import _cccl_interop as cccl from ..._caching import cache_with_key from ..._cccl_interop import call_build, set_cccl_iterator_state +from ..._nvtx import annotate from ..._utils.protocols import ( get_data_pointer, get_dtype, @@ -66,6 +67,7 @@ def __init__( self.end_offsets_in_cccl, ) + @annotate(message="_SegmentedSort.__call__") def __call__( self, temp_storage, @@ -166,6 +168,7 @@ def make_cache_key( ) +@annotate() @cache_with_key(make_cache_key) def make_segmented_sort( d_in_keys: DeviceArrayLike | DoubleBuffer, @@ -209,6 +212,7 @@ def make_segmented_sort( ) +@annotate() def segmented_sort( d_in_keys: DeviceArrayLike | DoubleBuffer, d_out_keys: DeviceArrayLike | None, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py b/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py index 9727d6f4ce5..cdcf64eab94 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py @@ -11,6 +11,7 @@ from .. 
import _cccl_interop as cccl from .._caching import cache_with_key from .._cccl_interop import call_build, set_cccl_iterator_state +from .._nvtx import annotate from .._utils import protocols from .._utils.temp_storage_buffer import TempStorageBuffer from ..iterators._iterators import IteratorBase @@ -112,6 +113,7 @@ def __init__( self.select_second_part_op_cccl, ) + @annotate(message="_ThreeWayPartition.__call__") def __call__( self, temp_storage, @@ -176,6 +178,7 @@ def _make_three_way_partition_cached( ) +@annotate() def make_three_way_partition( d_in: DeviceArrayLike | IteratorBase, d_first_part_out: DeviceArrayLike | IteratorBase, @@ -223,6 +226,7 @@ def make_three_way_partition( ) +@annotate() def three_way_partition( d_in: DeviceArrayLike | IteratorBase, d_first_part_out: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_transform.py b/python/cuda_cccl/cuda/compute/algorithms/_transform.py index 82abe8a7fae..fae1264c7fd 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_transform.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_transform.py @@ -9,6 +9,7 @@ from .. 
import _cccl_interop as cccl from .._caching import cache_with_key from .._cccl_interop import set_cccl_iterator_state +from .._nvtx import annotate from .._utils import protocols from ..iterators._iterators import IteratorBase from ..op import OpAdapter, OpKind, make_op_adapter @@ -39,6 +40,7 @@ def __init__( self.op_cccl, ) + @annotate(message="_UnaryTransform.__call__") def __call__( self, d_in, @@ -95,6 +97,7 @@ def __init__( self.op_cccl, ) + @annotate(message="_BinaryTransform.__call__") def __call__( self, d_in1, @@ -172,6 +175,7 @@ def _make_binary_transform_cached( return _BinaryTransform(d_in1, d_in2, d_out, op) +@annotate() def make_unary_transform( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, @@ -202,6 +206,7 @@ def make_unary_transform( return _make_unary_transform_cached(d_in, d_out, op_adapter) +@annotate() def make_binary_transform( d_in1: DeviceArrayLike | IteratorBase, d_in2: DeviceArrayLike | IteratorBase, @@ -234,6 +239,7 @@ def make_binary_transform( return _make_binary_transform_cached(d_in1, d_in2, d_out, op_adapter) +@annotate() def unary_transform( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, @@ -276,6 +282,7 @@ def unary_transform( transformer(d_in, d_out, num_items, stream) +@annotate() def binary_transform( d_in1: DeviceArrayLike | IteratorBase, d_in2: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py b/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py index 8f9c014feb0..0de37825129 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py @@ -11,6 +11,7 @@ from .. 
import _cccl_interop as cccl from .._caching import cache_with_key from .._cccl_interop import call_build, set_cccl_iterator_state +from .._nvtx import annotate from .._utils import protocols from .._utils.protocols import ( get_data_pointer, @@ -103,6 +104,7 @@ def __init__( self.op_cccl, ) + @annotate(message="_UniqueByKey.__call__") def __call__( self, temp_storage, @@ -160,6 +162,7 @@ def _make_unique_by_key_cached( ) +@annotate() def make_unique_by_key( d_in_keys: DeviceArrayLike | IteratorBase, d_in_items: DeviceArrayLike | IteratorBase, @@ -200,6 +203,7 @@ def make_unique_by_key( ) +@annotate() def unique_by_key( d_in_keys: DeviceArrayLike | IteratorBase, d_in_items: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/iterators/_factories.py b/python/cuda_cccl/cuda/compute/iterators/_factories.py index 5023e660600..d398498b1cc 100644 --- a/python/cuda_cccl/cuda/compute/iterators/_factories.py +++ b/python/cuda_cccl/cuda/compute/iterators/_factories.py @@ -1,5 +1,6 @@ import numba +from .._nvtx import annotate from ._iterators import ( CacheModifiedPointer as _CacheModifiedPointer, ) @@ -20,6 +21,7 @@ from ._zip_iterator import make_zip_iterator +@annotate() def CacheModifiedInputIterator(device_array, modifier): """Random Access Cache Modified Iterator that wraps a native device pointer. @@ -50,6 +52,7 @@ def CacheModifiedInputIterator(device_array, modifier): ) +@annotate() def ConstantIterator(value): """Returns an Iterator representing a sequence of constant values. @@ -73,6 +76,7 @@ def ConstantIterator(value): return _ConstantIterator(value) +@annotate() def CountingIterator(offset): """Returns an Iterator representing a sequence of incrementing values. @@ -96,6 +100,7 @@ def CountingIterator(offset): return _CountingIterator(offset) +@annotate() def DiscardIterator(reference_iterator=None): """Returns an Input or Output Iterator that discards all values written to it. 
@@ -119,6 +124,7 @@ def DiscardIterator(reference_iterator=None): return _DiscardIterator(reference_iterator) +@annotate() def ReverseIterator(sequence): """Returns an Iterator over an array or another iterator in reverse. @@ -147,6 +153,7 @@ def ReverseIterator(sequence): return make_reverse_iterator(sequence) +@annotate() def TransformIterator(it, op): """An iterator that applies a user-defined unary function to the elements of an underlying iterator as they are read. @@ -169,6 +176,7 @@ def TransformIterator(it, op): return make_transform_iterator(it, op, "input") +@annotate() def TransformOutputIterator(it, op): """An iterator that applies a user-defined unary function to values before writing them to an underlying iterator. @@ -192,6 +200,7 @@ def TransformOutputIterator(it, op): return make_transform_iterator(it, op, "output") +@annotate() def PermutationIterator(values, indices): """Returns an Iterator that accesses values through an index mapping. @@ -219,6 +228,7 @@ def PermutationIterator(values, indices): return make_permutation_iterator(values, indices) +@annotate() def ZipIterator(*iterators): """Returns an Iterator representing a zipped sequence of values from N iterators. diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index a401fbe4bcd..75ea89eedd0 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "cuda-pathfinder>=1.2.3", "cuda-core", "typing_extensions", + "nvtx", ] dynamic = ["version"] @@ -108,6 +109,7 @@ module = [ "cuda.bindings.*", "cuda.core.*", "cuda.pathfinder.*", + "nvtx", ] ignore_missing_imports = true follow_imports = "skip"