diff --git a/python/cuda_cccl/cuda/compute/_nvtx.py b/python/cuda_cccl/cuda/compute/_nvtx.py
new file mode 100644
index 00000000000..364af934b6b
--- /dev/null
+++ b/python/cuda_cccl/cuda/compute/_nvtx.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+NVTX annotation utilities for the cuda.compute module.
+Annotations default to the NVIDIA green color (0x76B900) and the "cuda.compute" domain.
+"""
+
+import functools
+
+import nvtx
+
+# NVIDIA green color hex value (76B900)
+NVIDIA_GREEN = 0x76B900
+
+# Domain name for cuda.compute annotations
+COMPUTE_DOMAIN = "cuda.compute"
+
+
+def annotate(message=None, domain=None, category=None, color=None):
+    """
+    Create a decorator that wraps each call of a function in an NVTX range.
+
+    Args:
+        message: Optional range label. If None, uses the function name.
+        domain: Optional NVTX domain string. Defaults to "cuda.compute".
+        category: Optional category for the annotation.
+        color: Optional color in hexadecimal format (0xRRGGBB). Defaults to NVIDIA green (0x76B900).
+
+    Returns:
+        A decorator; applying it to a function returns the wrapped function.
+    """
+
+    def decorator(func):
+        # Build the NVTX range once, at decoration time, rather than on every
+        # call: an nvtx.annotate object is a reusable context manager, so the
+        # per-call cost is reduced to entering and exiting the range.
+        nvtx_range = nvtx.annotate(
+            message if message is not None else func.__name__,
+            domain=domain if domain is not None else COMPUTE_DOMAIN,
+            color=color if color is not None else NVIDIA_GREEN,
+            category=category,
+        )
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            with nvtx_range:
+                return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_histogram.py b/python/cuda_cccl/cuda/compute/algorithms/_histogram.py
index 9e15d3c3ff0..563fd30d418 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_histogram.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_histogram.py
@@ -11,6 +11,7 @@
 from .. import _cccl_interop as cccl
 from .._caching import cache_with_key
 from .._cccl_interop import call_build, set_cccl_iterator_state, to_cccl_value_state
+from .._nvtx import annotate
 from .._utils.protocols import get_data_pointer, get_dtype, validate_and_get_stream
 from .._utils.temp_storage_buffer import TempStorageBuffer
 from ..iterators._iterators import IteratorBase
@@ -90,6 +91,7 @@ def __init__(
             is_evenly_segmented,
         )
 
+    @annotate(message="_Histogram.__call__")
     def __call__(
         self,
         temp_storage,
@@ -134,6 +136,7 @@ def __call__(
         return temp_storage_bytes
 
 
+@annotate()
 @cache_with_key(make_cache_key)
 def make_histogram_even(
     d_samples: DeviceArrayLike | IteratorBase,
@@ -173,6 +176,7 @@ def make_histogram_even(
     )
 
 
+@annotate()
 def histogram_even(
     d_samples: DeviceArrayLike | IteratorBase,
     d_histogram: DeviceArrayLike,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_reduce.py b/python/cuda_cccl/cuda/compute/algorithms/_reduce.py
index d527a40a4e9..859fd339ff8 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_reduce.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_reduce.py
@@ -16,6 +16,7 @@
     set_cccl_iterator_state,
     to_cccl_value_state,
 )
+from .._nvtx import annotate
 from .._utils import protocols
 from .._utils.protocols import get_data_pointer, validate_and_get_stream
 from .._utils.temp_storage_buffer import TempStorageBuffer
@@ -140,6 +141,7 @@ def _make_reduce_into_cached(
 
 # TODO Figure out `sum` without operator and initial value
 # TODO Accept stream
+@annotate()
 def make_reduce_into(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
@@ -170,6 +172,7 @@ def make_reduce_into(
     return _make_reduce_into_cached(d_in, d_out, op_adapter, h_init, **kwargs)
 
 
+@annotate()
 def reduce_into(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_scan.py b/python/cuda_cccl/cuda/compute/algorithms/_scan.py
index ef9861aca22..c71edb0eb4c 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_scan.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_scan.py
@@ -17,6 +17,7 @@
     set_cccl_iterator_state,
     to_cccl_value_state,
 )
+from .._nvtx import annotate
 from .._utils import protocols
 from .._utils.protocols import get_data_pointer, validate_and_get_stream
 from .._utils.temp_storage_buffer import TempStorageBuffer
@@ -114,6 +115,7 @@ def __init__(
             case (False, _bindings.InitKind.NO_INIT):
                 raise ValueError("Exclusive scan with No init value is not supported")
 
+    @annotate(message="_Scan.__call__")
     def __call__(
         self,
         temp_storage,
@@ -214,6 +216,7 @@ def _make_inclusive_scan_cached(
 
 # TODO Figure out `sum` without operator and initial value
 # TODO Accept stream
+@annotate()
 def make_exclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
@@ -243,6 +246,7 @@ def make_exclusive_scan(
     return _make_exclusive_scan_cached(d_in, d_out, op_adapter, init_value)
 
 
+@annotate()
 def exclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
@@ -280,6 +284,7 @@ def exclusive_scan(
 
 # TODO Figure out `sum` without operator and initial value
 # TODO Accept stream
+@annotate()
 def make_inclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
@@ -309,6 +314,7 @@ def make_inclusive_scan(
     return _make_inclusive_scan_cached(d_in, d_out, op_adapter, init_value)
 
 
+@annotate()
 def inclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py b/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py
index ff50b294b69..eadc7bf00d1 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py
@@ -11,6 +11,7 @@
     set_cccl_iterator_state,
     to_cccl_value_state,
 )
+from .._nvtx import annotate
 from .._utils import protocols
 from .._utils.protocols import (
     get_data_pointer,
@@ -84,6 +85,7 @@ def __init__(
             self.h_init_cccl,
         )
 
+    @annotate(message="_SegmentedReduce.__call__")
     def __call__(
         self,
         temp_storage,
@@ -171,6 +173,7 @@ def _make_segmented_reduce_cached(
     return _SegmentedReduce(d_in, d_out, start_offsets_in, end_offsets_in, op, h_init)
 
 
+@annotate()
 def make_segmented_reduce(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
@@ -206,6 +209,7 @@ def make_segmented_reduce(
     )
 
 
+@annotate()
 def segmented_reduce(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_select.py b/python/cuda_cccl/cuda/compute/algorithms/_select.py
index 89d26a268bd..7c01f2427c0 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_select.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_select.py
@@ -6,6 +6,7 @@
 from typing import Callable
 
 from .._caching import cache_with_key
+from .._nvtx import annotate
 from .._utils import protocols
 from .._utils.temp_storage_buffer import TempStorageBuffer
 from ..iterators._factories import DiscardIterator
@@ -62,6 +63,7 @@ def _cccl_always_false(x):
             _cccl_always_false,  # select_second_part_op - always false
         )
 
+    @annotate(message="_Select.__call__")
     def __call__(
         self,
         temp_storage,
@@ -97,6 +99,7 @@ def _make_select_cached(
     return _Select(d_in, d_out, d_num_selected_out, cond)
 
 
+@annotate()
 def make_select(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
@@ -131,6 +134,7 @@ def make_select(
     return _make_select_cached(d_in, d_out, d_num_selected_out, cond_adapter)
 
 
+@annotate()
 def select(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py
index 84d5134c537..41f058dabc4 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py
@@ -11,6 +11,7 @@
 from ... import _cccl_interop as cccl
 from ..._caching import cache_with_key
 from ..._cccl_interop import call_build, set_cccl_iterator_state
+from ..._nvtx import annotate
 from ..._utils import protocols
 from ..._utils.protocols import (
     get_data_pointer,
@@ -103,6 +104,7 @@ def __init__(
             self.op_cccl,
         )
 
+    @annotate(message="_MergeSort.__call__")
     def __call__(
         self,
         temp_storage,
@@ -161,6 +163,7 @@ def _make_merge_sort_cached(
     return _MergeSort(d_in_keys, d_in_items, d_out_keys, d_out_items, op)
 
 
+@annotate()
 def make_merge_sort(
     d_in_keys: DeviceArrayLike | IteratorBase,
     d_in_items: DeviceArrayLike | IteratorBase | None,
@@ -194,6 +197,7 @@ def make_merge_sort(
     )
 
 
+@annotate()
 def merge_sort(
     d_in_keys: DeviceArrayLike | IteratorBase,
     d_in_items: DeviceArrayLike | IteratorBase | None,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py
index 1080143018a..66c292b890a 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py
@@ -7,6 +7,7 @@
 from ... import _cccl_interop as cccl
 from ..._caching import cache_with_key
 from ..._cccl_interop import call_build, set_cccl_iterator_state
+from ..._nvtx import annotate
 from ..._utils.protocols import (
     get_data_pointer,
     get_dtype,
@@ -94,6 +95,7 @@ def __init__(
             decomposer_return_type,
         )
 
+    @annotate(message="_RadixSort.__call__")
     def __call__(
         self,
         temp_storage,
@@ -164,6 +166,7 @@ def __call__(
         return temp_storage_bytes
 
 
+@annotate()
 @cache_with_key(make_cache_key)
 def make_radix_sort(
     d_in_keys: DeviceArrayLike | DoubleBuffer,
@@ -195,6 +198,7 @@ def make_radix_sort(
     return _RadixSort(d_in_keys, d_out_keys, d_in_values, d_out_values, order)
 
 
+@annotate()
 def radix_sort(
     d_in_keys: DeviceArrayLike | DoubleBuffer,
     d_out_keys: DeviceArrayLike | None,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py
index 2df9bcc3db7..d80a2532766 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py
@@ -8,6 +8,7 @@
 from ... import _cccl_interop as cccl
 from ..._caching import cache_with_key
 from ..._cccl_interop import call_build, set_cccl_iterator_state
+from ..._nvtx import annotate
 from ..._utils.protocols import (
     get_data_pointer,
     get_dtype,
@@ -66,6 +67,7 @@ def __init__(
             self.end_offsets_in_cccl,
         )
 
+    @annotate(message="_SegmentedSort.__call__")
     def __call__(
         self,
         temp_storage,
@@ -166,6 +168,7 @@ def make_cache_key(
     )
 
 
+@annotate()
 @cache_with_key(make_cache_key)
 def make_segmented_sort(
     d_in_keys: DeviceArrayLike | DoubleBuffer,
@@ -209,6 +212,7 @@ def make_segmented_sort(
     )
 
 
+@annotate()
 def segmented_sort(
     d_in_keys: DeviceArrayLike | DoubleBuffer,
     d_out_keys: DeviceArrayLike | None,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py b/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py
index 9727d6f4ce5..cdcf64eab94 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py
@@ -11,6 +11,7 @@
 from .. import _cccl_interop as cccl
 from .._caching import cache_with_key
 from .._cccl_interop import call_build, set_cccl_iterator_state
+from .._nvtx import annotate
 from .._utils import protocols
 from .._utils.temp_storage_buffer import TempStorageBuffer
 from ..iterators._iterators import IteratorBase
@@ -112,6 +113,7 @@ def __init__(
             self.select_second_part_op_cccl,
         )
 
+    @annotate(message="_ThreeWayPartition.__call__")
     def __call__(
         self,
         temp_storage,
@@ -176,6 +178,7 @@ def _make_three_way_partition_cached(
     )
 
 
+@annotate()
 def make_three_way_partition(
     d_in: DeviceArrayLike | IteratorBase,
     d_first_part_out: DeviceArrayLike | IteratorBase,
@@ -223,6 +226,7 @@ def make_three_way_partition(
     )
 
 
+@annotate()
 def three_way_partition(
     d_in: DeviceArrayLike | IteratorBase,
     d_first_part_out: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_transform.py b/python/cuda_cccl/cuda/compute/algorithms/_transform.py
index 82abe8a7fae..fae1264c7fd 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_transform.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_transform.py
@@ -9,6 +9,7 @@
 from .. import _cccl_interop as cccl
 from .._caching import cache_with_key
 from .._cccl_interop import set_cccl_iterator_state
+from .._nvtx import annotate
 from .._utils import protocols
 from ..iterators._iterators import IteratorBase
 from ..op import OpAdapter, OpKind, make_op_adapter
@@ -39,6 +40,7 @@ def __init__(
             self.op_cccl,
         )
 
+    @annotate(message="_UnaryTransform.__call__")
     def __call__(
         self,
         d_in,
@@ -95,6 +97,7 @@ def __init__(
             self.op_cccl,
         )
 
+    @annotate(message="_BinaryTransform.__call__")
     def __call__(
         self,
         d_in1,
@@ -172,6 +175,7 @@ def _make_binary_transform_cached(
     return _BinaryTransform(d_in1, d_in2, d_out, op)
 
 
+@annotate()
 def make_unary_transform(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
@@ -202,6 +206,7 @@ def make_unary_transform(
     return _make_unary_transform_cached(d_in, d_out, op_adapter)
 
 
+@annotate()
 def make_binary_transform(
     d_in1: DeviceArrayLike | IteratorBase,
     d_in2: DeviceArrayLike | IteratorBase,
@@ -234,6 +239,7 @@ def make_binary_transform(
     return _make_binary_transform_cached(d_in1, d_in2, d_out, op_adapter)
 
 
+@annotate()
 def unary_transform(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
@@ -276,6 +282,7 @@ def unary_transform(
     transformer(d_in, d_out, num_items, stream)
 
 
+@annotate()
 def binary_transform(
     d_in1: DeviceArrayLike | IteratorBase,
     d_in2: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py b/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py
index 8f9c014feb0..0de37825129 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py
@@ -11,6 +11,7 @@
 from .. import _cccl_interop as cccl
 from .._caching import cache_with_key
 from .._cccl_interop import call_build, set_cccl_iterator_state
+from .._nvtx import annotate
 from .._utils import protocols
 from .._utils.protocols import (
     get_data_pointer,
@@ -103,6 +104,7 @@ def __init__(
             self.op_cccl,
         )
 
+    @annotate(message="_UniqueByKey.__call__")
     def __call__(
         self,
         temp_storage,
@@ -160,6 +162,7 @@ def _make_unique_by_key_cached(
     )
 
 
+@annotate()
 def make_unique_by_key(
     d_in_keys: DeviceArrayLike | IteratorBase,
     d_in_items: DeviceArrayLike | IteratorBase,
@@ -200,6 +203,7 @@ def make_unique_by_key(
     )
 
 
+@annotate()
 def unique_by_key(
     d_in_keys: DeviceArrayLike | IteratorBase,
     d_in_items: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/iterators/_factories.py b/python/cuda_cccl/cuda/compute/iterators/_factories.py
index 5023e660600..d398498b1cc 100644
--- a/python/cuda_cccl/cuda/compute/iterators/_factories.py
+++ b/python/cuda_cccl/cuda/compute/iterators/_factories.py
@@ -1,5 +1,6 @@
 import numba
 
+from .._nvtx import annotate
 from ._iterators import (
     CacheModifiedPointer as _CacheModifiedPointer,
 )
@@ -20,6 +21,7 @@
 from ._zip_iterator import make_zip_iterator
 
 
+@annotate()
 def CacheModifiedInputIterator(device_array, modifier):
     """Random Access Cache Modified Iterator that wraps a native device pointer.
 
@@ -50,6 +52,7 @@ def CacheModifiedInputIterator(device_array, modifier):
     )
 
 
+@annotate()
 def ConstantIterator(value):
     """Returns an Iterator representing a sequence of constant values.
 
@@ -73,6 +76,7 @@ def ConstantIterator(value):
     return _ConstantIterator(value)
 
 
+@annotate()
 def CountingIterator(offset):
     """Returns an Iterator representing a sequence of incrementing values.
 
@@ -96,6 +100,7 @@ def CountingIterator(offset):
     return _CountingIterator(offset)
 
 
+@annotate()
 def DiscardIterator(reference_iterator=None):
     """Returns an Input or Output Iterator that discards all values written to it.
 
@@ -119,6 +124,7 @@ def DiscardIterator(reference_iterator=None):
     return _DiscardIterator(reference_iterator)
 
 
+@annotate()
 def ReverseIterator(sequence):
     """Returns an Iterator over an array or another iterator in reverse.
 
@@ -147,6 +153,7 @@ def ReverseIterator(sequence):
     return make_reverse_iterator(sequence)
 
 
+@annotate()
 def TransformIterator(it, op):
     """An iterator that applies a user-defined unary function to the elements of an underlying iterator as they are read.
 
@@ -169,6 +176,7 @@ def TransformIterator(it, op):
     return make_transform_iterator(it, op, "input")
 
 
+@annotate()
 def TransformOutputIterator(it, op):
     """An iterator that applies a user-defined unary function to values before writing them to an underlying iterator.
 
@@ -192,6 +200,7 @@ def TransformOutputIterator(it, op):
     return make_transform_iterator(it, op, "output")
 
 
+@annotate()
 def PermutationIterator(values, indices):
     """Returns an Iterator that accesses values through an index mapping.
 
@@ -219,6 +228,7 @@ def PermutationIterator(values, indices):
     return make_permutation_iterator(values, indices)
 
 
+@annotate()
 def ZipIterator(*iterators):
     """Returns an Iterator representing a zipped sequence of values from N iterators.
 
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index a401fbe4bcd..75ea89eedd0 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
   "cuda-pathfinder>=1.2.3",
   "cuda-core",
   "typing_extensions",
+  "nvtx",
 ]
 
 dynamic = ["version"]
@@ -108,6 +109,7 @@ module = [
   "cuda.bindings.*",
   "cuda.core.*",
   "cuda.pathfinder.*",
+  "nvtx",
 ]
 ignore_missing_imports = true
 follow_imports = "skip"