-
Notifications
You must be signed in to change notification settings - Fork 0
/
cuda.h
14503 lines (13862 loc) · 564 KB
/
cuda.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef __cuda_cuda_h__
#define __cuda_cuda_h__
#include <stdlib.h>
/* Fixed-width unsigned integer aliases: MSVC builds use the compiler's sized
 * integer keywords, every other compiler takes them from <stdint.h>. */
#if !defined(_MSC_VER)
#include <stdint.h>
typedef uint32_t cuuint32_t;
typedef uint64_t cuuint64_t;
#else
typedef unsigned __int32 cuuint32_t;
typedef unsigned __int64 cuuint64_t;
#endif
/**
* CUDA API versioning support
*
* __CUDA_DEPRECATED is a marker macro for deprecated declarations (its uses
* lie outside this section). It expands to nothing when any of
* __CUDA_API_VERSION_INTERNAL, __DOXYGEN_ONLY__ or CUDA_ENABLE_DEPRECATED is
* defined. Otherwise it expands to the MSVC or GNU "deprecated" attribute,
* and to nothing on any other compiler. The first matching branch wins, so
* the order of the tests below is significant.
*/
#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
#define __CUDA_DEPRECATED
#elif defined(_MSC_VER)
#define __CUDA_DEPRECATED __declspec(deprecated)
#elif defined(__GNUC__)
#define __CUDA_DEPRECATED __attribute__((deprecated))
#else
#define __CUDA_DEPRECATED
#endif
/* Select the API version exposed by this header. When nothing is forced the
 * version is 10000. CUDA_FORCE_API_VERSION may only request 3010; any other
 * forced value is rejected at compile time. */
#if !defined(CUDA_FORCE_API_VERSION)
#define __CUDA_API_VERSION 10000
#elif (CUDA_FORCE_API_VERSION == 3010)
#define __CUDA_API_VERSION 3010
#else
#error "Unsupported value of CUDA_FORCE_API_VERSION"
#endif /* CUDA_FORCE_API_VERSION */
/* Name wrappers for per-thread-default-stream builds. In ordinary builds they
 * leave the API name untouched. In internal builds, or when
 * CUDA_API_PER_THREAD_DEFAULT_STREAM is defined, they append _ptds / _ptsz to
 * the name and __CUDA_API_PER_THREAD_DEFAULT_STREAM is defined as a marker. */
#if !defined(__CUDA_API_VERSION_INTERNAL) && !defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
#define __CUDA_API_PTDS(api) api
#define __CUDA_API_PTSZ(api) api
#else
#define __CUDA_API_PER_THREAD_DEFAULT_STREAM
#define __CUDA_API_PTDS(api) api ## _ptds
#define __CUDA_API_PTSZ(api) api ## _ptsz
#endif
/*
 * From API version 3020 on (or for internal builds) the unsuffixed names below
 * are redirected to their _v2 entry points. The synchronous memcpy/memset
 * names additionally pass through __CUDA_API_PTDS and the asynchronous ones
 * through __CUDA_API_PTSZ, which append _ptds / _ptsz in
 * per-thread-default-stream builds and are the identity otherwise.
 */
#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020
#define cuDeviceTotalMem cuDeviceTotalMem_v2
#define cuCtxCreate cuCtxCreate_v2
#define cuModuleGetGlobal cuModuleGetGlobal_v2
#define cuMemGetInfo cuMemGetInfo_v2
#define cuMemAlloc cuMemAlloc_v2
#define cuMemAllocPitch cuMemAllocPitch_v2
#define cuMemFree cuMemFree_v2
#define cuMemGetAddressRange cuMemGetAddressRange_v2
#define cuMemAllocHost cuMemAllocHost_v2
#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2
#define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2)
#define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2)
#define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2)
#define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2)
#define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2)
#define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2)
#define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2)
#define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2)
#define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2)
#define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2)
#define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2)
#define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2)
#define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2)
#define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2)
#define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2)
#define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2)
#define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2)
#define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2)
#define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2)
#define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2)
#define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2)
#define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2)
#define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2)
#define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2)
#define cuArrayCreate cuArrayCreate_v2
#define cuArrayGetDescriptor cuArrayGetDescriptor_v2
#define cuArray3DCreate cuArray3DCreate_v2
#define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2
#define cuTexRefSetAddress cuTexRefSetAddress_v2
#define cuTexRefGetAddress cuTexRefGetAddress_v2
#define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2
#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 3020 */
/* API 4000 and later (or internal builds): the context push/pop/destroy and
 * stream/event destroy names resolve to their _v2 entry points. */
#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4000
#define cuCtxDestroy cuCtxDestroy_v2
#define cuCtxPopCurrent cuCtxPopCurrent_v2
#define cuCtxPushCurrent cuCtxPushCurrent_v2
#define cuStreamDestroy cuStreamDestroy_v2
#define cuEventDestroy cuEventDestroy_v2
#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4000 */
/* API 4010 and later (or internal builds): cuTexRefSetAddress2D resolves to
 * its _v3 entry point. */
#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4010
#define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3
#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4010 */
/* API 6050 and later (or internal builds): route the linker, host-register
 * and graphics map-flag names to their _v2 entry points. */
#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050
#define cuLinkCreate cuLinkCreate_v2
#define cuLinkAddData cuLinkAddData_v2
#define cuLinkAddFile cuLinkAddFile_v2
#define cuMemHostRegister cuMemHostRegister_v2
#define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2
#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */
/* Public (non-internal) builds targeting API 3020 up to, but not including,
 * 4010 bind cuTexRefSetAddress2D to its _v2 entry point. */
#if !defined(__CUDA_API_VERSION_INTERNAL) && defined(__CUDA_API_VERSION) && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010
#define cuTexRefSetAddress2D cuTexRefSetAddress2D_v2
#endif /* !__CUDA_API_VERSION_INTERNAL && 3020 <= __CUDA_API_VERSION < 4010 */
/*
 * Per-thread-default-stream builds only: the names below are wrapped directly
 * (no _v2 suffix is involved in this list). __CUDA_API_PTDS appends _ptds and
 * __CUDA_API_PTSZ appends _ptsz, so for example cuMemcpy becomes cuMemcpy_ptds
 * and cuMemcpyAsync becomes cuMemcpyAsync_ptsz. A macro name is not expanded
 * again inside its own replacement, so the self-reference is safe.
 */
#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
#define cuMemcpy __CUDA_API_PTDS(cuMemcpy)
#define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync)
#define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer)
#define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync)
#define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer)
#define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync)
#define cuMemPrefetchAsync __CUDA_API_PTSZ(cuMemPrefetchAsync)
#define cuMemsetD8Async __CUDA_API_PTSZ(cuMemsetD8Async)
#define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async)
#define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async)
#define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async)
#define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async)
#define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async)
#define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority)
#define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags)
#define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx)
#define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent)
#define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture)
#define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture)
#define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing)
#define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback)
#define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync)
#define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery)
#define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize)
#define cuEventRecord __CUDA_API_PTSZ(cuEventRecord)
#define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel)
#define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc)
#define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources)
#define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources)
#define cuStreamWriteValue32 __CUDA_API_PTSZ(cuStreamWriteValue32)
#define cuStreamWaitValue32 __CUDA_API_PTSZ(cuStreamWaitValue32)
#define cuStreamWriteValue64 __CUDA_API_PTSZ(cuStreamWriteValue64)
#define cuStreamWaitValue64 __CUDA_API_PTSZ(cuStreamWaitValue64)
#define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp)
#define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel)
#define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync)
#define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync)
#define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch)
#endif
/**
* \file cuda.h
* \brief Header file for the CUDA Toolkit application programming interface.
*
* \file cudaGL.h
* \brief Header file for the OpenGL interoperability functions of the
* low-level CUDA driver application programming interface.
*
* \file cudaD3D9.h
* \brief Header file for the Direct3D 9 interoperability functions of the
* low-level CUDA driver application programming interface.
*/
/**
* \defgroup CUDA_TYPES Data types used by CUDA driver
* @{
*/
/**
* CUDA API version number
* (the same value this header assigns to __CUDA_API_VERSION when no version
* is forced through CUDA_FORCE_API_VERSION)
*/
#define CUDA_VERSION 10000
#ifdef __cplusplus
extern "C" {
#endif
/**
 * CUDA device pointer
 * CUdeviceptr is an unsigned integer type whose size matches the size of a
 * pointer on the target platform: unsigned long long when _WIN64 or __LP64__
 * is defined, unsigned int otherwise. Only declared here for API versions
 * 3020 and later.
 */
#if __CUDA_API_VERSION >= 3020
#if !defined(_WIN64) && !defined(__LP64__)
typedef unsigned int CUdeviceptr;
#else
typedef unsigned long long CUdeviceptr;
#endif
#endif /* __CUDA_API_VERSION >= 3020 */
/* Handles that are plain scalar values */
typedef int CUdevice; /**< CUDA device */
typedef unsigned long long CUtexObject; /**< An opaque value that represents a CUDA texture object */
typedef unsigned long long CUsurfObject; /**< An opaque value that represents a CUDA surface object */
/* Handles that are pointers to structs which are only named, not defined, in this section */
typedef struct CUctx_st *CUcontext; /**< CUDA context */
typedef struct CUmod_st *CUmodule; /**< CUDA module */
typedef struct CUfunc_st *CUfunction; /**< CUDA function */
typedef struct CUstream_st *CUstream; /**< CUDA stream */
typedef struct CUevent_st *CUevent; /**< CUDA event */
typedef struct CUarray_st *CUarray; /**< CUDA array */
typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */
typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */
typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */
typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */
typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory */
typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore */
typedef struct CUgraph_st *CUgraph; /**< CUDA graph */
typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node */
typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph */
#if !defined(CU_UUID_HAS_BEEN_DEFINED)
#define CU_UUID_HAS_BEEN_DEFINED
/** CUDA definition of UUID: 16 raw bytes. The guard macro keeps the type from being declared twice. */
struct CUuuid_st {
    char bytes[16];
};
typedef struct CUuuid_st CUuuid;
#endif
#if __CUDA_API_VERSION >= 4010
/**
 * Size in bytes of the CUDA IPC handles declared below
 */
#define CU_IPC_HANDLE_SIZE 64
/**
 * CUDA IPC event handle: CU_IPC_HANDLE_SIZE reserved bytes
 */
struct CUipcEventHandle_st {
    char reserved[CU_IPC_HANDLE_SIZE];
};
typedef struct CUipcEventHandle_st CUipcEventHandle;
/**
 * CUDA IPC mem handle: CU_IPC_HANDLE_SIZE reserved bytes
 */
struct CUipcMemHandle_st {
    char reserved[CU_IPC_HANDLE_SIZE];
};
typedef struct CUipcMemHandle_st CUipcMemHandle;
/**
 * CUDA Ipc Mem Flags
 */
enum CUipcMem_flags_enum {
    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 1 /**< Automatically enable peer access between remote devices as needed */
};
typedef enum CUipcMem_flags_enum CUipcMem_flags;
#endif /* __CUDA_API_VERSION >= 4010 */
/**
 * CUDA Mem Attach Flags (one bit per attachment mode)
 */
enum CUmemAttach_flags_enum {
    CU_MEM_ATTACH_GLOBAL = 1 << 0, /**< Memory can be accessed by any stream on any device */
    CU_MEM_ATTACH_HOST = 1 << 1, /**< Memory cannot be accessed by any stream on any device */
    CU_MEM_ATTACH_SINGLE = 1 << 2 /**< Memory can only be accessed by a single stream on the associated device */
};
typedef enum CUmemAttach_flags_enum CUmemAttach_flags;
/**
 * Context creation flags
 */
typedef enum CUctx_flags_enum {
    CU_CTX_SCHED_AUTO = 0, /**< Automatic scheduling */
    CU_CTX_SCHED_SPIN = 1, /**< Set spin as default scheduling */
    CU_CTX_SCHED_YIELD = 2, /**< Set yield as default scheduling */
    CU_CTX_SCHED_BLOCKING_SYNC = 4, /**< Set blocking synchronization as default scheduling */
    CU_CTX_BLOCKING_SYNC = CU_CTX_SCHED_BLOCKING_SYNC, /**< Set blocking synchronization as default scheduling
                                                        * \deprecated This flag was deprecated as of CUDA 4.0
                                                        * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */
    CU_CTX_SCHED_MASK = CU_CTX_SCHED_SPIN | CU_CTX_SCHED_YIELD | CU_CTX_SCHED_BLOCKING_SYNC, /**< All scheduling bits (0x07) */
    CU_CTX_MAP_HOST = 8, /**< Support mapped pinned allocations */
    CU_CTX_LMEM_RESIZE_TO_MAX = 16, /**< Keep local memory allocation after launch */
    CU_CTX_FLAGS_MASK = CU_CTX_SCHED_MASK | CU_CTX_MAP_HOST | CU_CTX_LMEM_RESIZE_TO_MAX /**< Every flag bit listed above (0x1f) */
} CUctx_flags;
/**
 * Stream creation flags
 */
enum CUstream_flags_enum {
    CU_STREAM_DEFAULT = 0, /**< Default stream flag */
    CU_STREAM_NON_BLOCKING = 1 /**< Stream does not synchronize with stream 0 (the NULL stream) */
};
typedef enum CUstream_flags_enum CUstream_flags;
/**
 * Legacy stream handle
 *
 * Pass this value wherever a CUstream is expected to select an implicit
 * stream that has legacy synchronization behavior.
 *
 * See details of the \link_sync_behavior
 */
#define CU_STREAM_LEGACY ((CUstream)1)
/**
 * Per-thread stream handle
 *
 * Pass this value wherever a CUstream is expected to select an implicit
 * stream that has per-thread synchronization behavior.
 *
 * See details of the \link_sync_behavior
 */
#define CU_STREAM_PER_THREAD ((CUstream)2)
/**
 * Event creation flags
 */
enum CUevent_flags_enum {
    CU_EVENT_DEFAULT = 0, /**< Default event flag */
    CU_EVENT_BLOCKING_SYNC = 1 << 0, /**< Event uses blocking synchronization */
    CU_EVENT_DISABLE_TIMING = 1 << 1, /**< Event will not record timing data */
    CU_EVENT_INTERPROCESS = 1 << 2 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
};
typedef enum CUevent_flags_enum CUevent_flags;
#if __CUDA_API_VERSION >= 8000
/**
* Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64
*
* The low values (0x0-0x3) select the comparison; ::CU_STREAM_WAIT_VALUE_FLUSH
* is a separate high bit (1<<30).
*/
typedef enum CUstreamWaitValue_flags_enum {
CU_STREAM_WAIT_VALUE_GEQ = 0x0, /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit
values). Note this is a cyclic comparison which ignores wraparound.
(Default behavior.) */
CU_STREAM_WAIT_VALUE_EQ = 0x1, /**< Wait until *addr == value. */
CU_STREAM_WAIT_VALUE_AND = 0x2, /**< Wait until (*addr & value) != 0. */
CU_STREAM_WAIT_VALUE_NOR = 0x3, /**< Wait until ~(*addr | value) != 0. Support for this operation can be
queried with ::cuDeviceGetAttribute() and
::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/
CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 /**< Follow the wait operation with a flush of outstanding remote writes. This
means that, if a remote write operation is guaranteed to have reached the
device before the wait can be satisfied, that write is guaranteed to be
visible to downstream device work. The device is permitted to reorder
remote writes internally. For example, this flag would be required if
two remote writes arrive in a defined order, the wait is satisfied by the
second write, and downstream work needs to observe the first write.
Support for this operation is restricted to selected platforms and can be
queried with ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.*/
} CUstreamWaitValue_flags;
/**
 * Flags for ::cuStreamWriteValue32
 */
enum CUstreamWriteValue_flags_enum {
    /** Default behavior */
    CU_STREAM_WRITE_VALUE_DEFAULT = 0,
    /**
     * Permits the write to be reordered with writes which were issued
     * before it, as a performance optimization. Normally,
     * ::cuStreamWriteValue32 will provide a memory fence before the
     * write, which has similar semantics to __threadfence_system() but
     * is scoped to the stream rather than a CUDA thread.
     */
    CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 1
};
typedef enum CUstreamWriteValue_flags_enum CUstreamWriteValue_flags;
/**
 * Operations for ::cuStreamBatchMemOp (listed in numeric order)
 */
enum CUstreamBatchMemOpType_enum {
    CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, /**< Represents a ::cuStreamWaitValue32 operation */
    CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, /**< Represents a ::cuStreamWriteValue32 operation */
    CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3, /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a
                                                   standalone operation. */
    CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, /**< Represents a ::cuStreamWaitValue64 operation */
    CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5 /**< Represents a ::cuStreamWriteValue64 operation */
};
typedef enum CUstreamBatchMemOpType_enum CUstreamBatchMemOpType;
/**
* Per-operation parameters for ::cuStreamBatchMemOp
*
* A union with one struct per kind of operation. Every variant begins with a
* ::CUstreamBatchMemOpType member named \p operation, and the union also
* exposes \p operation directly. Member order and types define the binary
* layout, so they must not be rearranged.
*/
typedef union CUstreamBatchMemOpParams_union {
CUstreamBatchMemOpType operation; /**< Operation type; also the first member of each variant below */
/* Parameters for a wait-value operation */
struct CUstreamMemOpWaitValueParams_st {
CUstreamBatchMemOpType operation;
CUdeviceptr address;
/* Anonymous union: the 32-bit and 64-bit value share the same storage */
union {
cuuint32_t value;
cuuint64_t value64;
};
unsigned int flags;
CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */
} waitValue;
/* Parameters for a write-value operation; same member layout as waitValue */
struct CUstreamMemOpWriteValueParams_st {
CUstreamBatchMemOpType operation;
CUdeviceptr address;
/* Anonymous union: the 32-bit and 64-bit value share the same storage */
union {
cuuint32_t value;
cuuint64_t value64;
};
unsigned int flags;
CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */
} writeValue;
/* Parameters for a flush-remote-writes operation */
struct CUstreamMemOpFlushRemoteWritesParams_st {
CUstreamBatchMemOpType operation;
unsigned int flags;
} flushRemoteWrites;
cuuint64_t pad[6]; /* Makes the union at least 6 x 64 bits (48 bytes); presumably reserves room for larger variants - confirm */
} CUstreamBatchMemOpParams;
#endif /* __CUDA_API_VERSION >= 8000 */
/**
 * Occupancy calculator flag
 */
enum CUoccupancy_flags_enum {
    CU_OCCUPANCY_DEFAULT = 0, /**< Default behavior */
    CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 1 /**< Assume global caching is enabled and cannot be automatically turned off */
};
typedef enum CUoccupancy_flags_enum CUoccupancy_flags;
/**
 * Array formats
 */
enum CUarray_format_enum {
    /* unsigned integer formats */
    CU_AD_FORMAT_UNSIGNED_INT8 = 1, /**< Unsigned 8-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT16 = 2, /**< Unsigned 16-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT32 = 3, /**< Unsigned 32-bit integers */
    /* signed integer formats */
    CU_AD_FORMAT_SIGNED_INT8 = 8, /**< Signed 8-bit integers */
    CU_AD_FORMAT_SIGNED_INT16 = 9, /**< Signed 16-bit integers */
    CU_AD_FORMAT_SIGNED_INT32 = 10, /**< Signed 32-bit integers */
    /* floating-point formats */
    CU_AD_FORMAT_HALF = 16, /**< 16-bit floating point */
    CU_AD_FORMAT_FLOAT = 32 /**< 32-bit floating point */
};
typedef enum CUarray_format_enum CUarray_format;
/**
 * Texture reference addressing modes (consecutive values starting at 0)
 */
enum CUaddress_mode_enum {
    CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */
    CU_TR_ADDRESS_MODE_CLAMP, /**< Clamp to edge address mode (1) */
    CU_TR_ADDRESS_MODE_MIRROR, /**< Mirror address mode (2) */
    CU_TR_ADDRESS_MODE_BORDER /**< Border address mode (3) */
};
typedef enum CUaddress_mode_enum CUaddress_mode;
/**
 * Texture reference filtering modes
 */
enum CUfilter_mode_enum {
    CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */
    CU_TR_FILTER_MODE_LINEAR /**< Linear filter mode (1) */
};
typedef enum CUfilter_mode_enum CUfilter_mode;
/**
* Device properties
*
* Each attribute has an explicit numeric value. A few values appear twice:
* in those cases one of the two names is marked deprecated in its comment and
* is an alias of the other. The final enumerator, ::CU_DEVICE_ATTRIBUTE_MAX,
* has no explicit value and therefore depends on being listed last.
*/
typedef enum CUdevice_attribute_enum {
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */
CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */
CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */
CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */
CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */
CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */
CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Maximum 1D linear texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,/**< Maximum mipmapped 2D texture height */
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */
CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */
CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */
CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */
CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */
CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/
CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */
CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */
CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */
CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */
CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */
CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */
CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice */
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */
CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< Both the ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */
CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */
CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */
CU_DEVICE_ATTRIBUTE_MAX /**< Sentinel: implicitly 102, one more than the enumerator listed just before it. Must remain the last entry. */
} CUdevice_attribute;
/**
 * Legacy device properties
 *
 * A plain record of int fields; the field order defines the binary layout.
 */
struct CUdevprop_st {
    /** Maximum number of threads per block */
    int maxThreadsPerBlock;
    /** Maximum size of each dimension of a block */
    int maxThreadsDim[3];
    /** Maximum size of each dimension of a grid */
    int maxGridSize[3];
    /** Shared memory available per block in bytes */
    int sharedMemPerBlock;
    /** Constant memory available on device in bytes */
    int totalConstantMemory;
    /** Warp size in threads */
    int SIMDWidth;
    /** Maximum pitch in bytes allowed by memory copies */
    int memPitch;
    /** 32-bit registers available per block */
    int regsPerBlock;
    /** Clock frequency in kilohertz */
    int clockRate;
    /** Alignment requirement for textures */
    int textureAlign;
};
typedef struct CUdevprop_st CUdevprop;
/**
 * Pointer information
 */
typedef enum CUpointer_attribute_enum {
    /** The ::CUcontext on which a pointer was allocated or registered */
    CU_POINTER_ATTRIBUTE_CONTEXT = 1,
    /** The ::CUmemorytype describing the physical location of a pointer */
    CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2,
    /** The address at which a pointer's memory may be accessed on the device */
    CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3,
    /** The address at which a pointer's memory may be accessed on the host */
    CU_POINTER_ATTRIBUTE_HOST_POINTER = 4,
    /** A pair of tokens for use with the nv-p2p.h Linux kernel interface */
    CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5,
    /** Synchronize every synchronous memory operation initiated on this region */
    CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6,
    /** A process-wide unique ID for an allocated memory region */
    CU_POINTER_ATTRIBUTE_BUFFER_ID = 7,
    /** Indicates if the pointer points to managed memory */
    CU_POINTER_ATTRIBUTE_IS_MANAGED = 8,
    /** A device ordinal of a device on which a pointer was allocated or registered */
    CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9
} CUpointer_attribute;
/**
 * Function properties
 */
typedef enum CUfunction_attribute_enum {
    /**
     * Maximum number of threads per block; a launch of the function with
     * more threads than this would fail. The value depends on both the
     * function and the device on which the function is currently loaded.
     */
    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,

    /**
     * Size in bytes of the statically-allocated shared memory this function
     * requires. Dynamically-allocated shared memory requested by the user at
     * runtime is not included.
     */
    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,

    /**
     * Size in bytes of the user-allocated constant memory this function
     * requires.
     */
    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,

    /**
     * Size in bytes of the local memory used by each thread of this function.
     */
    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,

    /**
     * Number of registers used by each thread of this function.
     */
    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,

    /**
     * PTX virtual architecture version the function was compiled for,
     * encoded as major PTX version * 10 + minor PTX version (a PTX
     * version 1.3 function would return the value 13). Note that this may
     * return the undefined value of 0 for cubins compiled prior to CUDA 3.0.
     */
    CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,

    /**
     * Binary architecture version the function was compiled for, encoded as
     * major binary version * 10 + minor binary version (a binary version 1.3
     * function would return the value 13). Note that this will return a
     * value of 10 for legacy cubins that do not have a properly-encoded
     * binary architecture version.
     */
    CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,

    /**
     * Indicates whether the function has been compiled with the user
     * specified option "-Xptxas --dlcm=ca" set.
     */
    CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7,

    /**
     * Maximum size in bytes of dynamically-allocated shared memory that can
     * be used by this function. If the user-specified dynamic shared memory
     * size is larger than this value, the launch will fail.
     */
    CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,

    /**
     * On devices where the L1 cache and shared memory use the same hardware
     * resources, this sets the shared memory carveout preference, in percent
     * of the total resources. This is only a hint, and the driver can choose
     * a different ratio if required to execute the function.
     */
    CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,

    CU_FUNC_ATTRIBUTE_MAX
} CUfunction_attribute;
/**
 * Function cache configurations
 */
typedef enum CUfunc_cache_enum {
    /** no preference for shared memory or L1 (default) */
    CU_FUNC_CACHE_PREFER_NONE = 0,
    /** prefer larger shared memory and smaller L1 cache */
    CU_FUNC_CACHE_PREFER_SHARED = 1,
    /** prefer larger L1 cache and smaller shared memory */
    CU_FUNC_CACHE_PREFER_L1 = 2,
    /** prefer equal sized L1 cache and shared memory */
    CU_FUNC_CACHE_PREFER_EQUAL = 3
} CUfunc_cache;
/**
 * Shared memory configurations
 */
typedef enum CUsharedconfig_enum {
    /** set default shared memory bank size */
    CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0,
    /** set shared memory bank width to four bytes */
    CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 1,
    /** set shared memory bank width to eight bytes */
    CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 2
} CUsharedconfig;
/**
 * Shared memory carveout configurations
 */
typedef enum CUshared_carveout_enum {
    /** no preference for shared memory or L1 (default) */
    CU_SHAREDMEM_CARVEOUT_DEFAULT = -1,
    /** prefer maximum available shared memory, minimum L1 cache */
    CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100,
    /** prefer maximum available L1 cache, minimum shared memory */
    CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0
} CUshared_carveout;
/**
 * Memory types
 */
typedef enum CUmemorytype_enum {
    /** Host memory */
    CU_MEMORYTYPE_HOST = 1,
    /** Device memory */
    CU_MEMORYTYPE_DEVICE = 2,
    /** Array memory */
    CU_MEMORYTYPE_ARRAY = 3,
    /** Unified device or host memory */
    CU_MEMORYTYPE_UNIFIED = 4
} CUmemorytype;
/**
 * Compute Modes
 *
 * Note: the value 1 is not assigned to any enumerator in this enum.
 */
typedef enum CUcomputemode_enum {
    /** Default compute mode (Multiple contexts allowed per device) */
    CU_COMPUTEMODE_DEFAULT = 0,
    /** Compute-prohibited mode (No contexts can be created on this device at this time) */
    CU_COMPUTEMODE_PROHIBITED = 2,
    /** Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
    CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3
} CUcomputemode;
/**
 * Memory advise values
 */
typedef enum CUmem_advise_enum {
    /** Data will mostly be read and only occasionally be written to */
    CU_MEM_ADVISE_SET_READ_MOSTLY = 1,
    /** Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */
    CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2,
    /** Set the preferred location for the data as the specified device */
    CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3,
    /** Clear the preferred location for the data */
    CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4,
    /** Data will be accessed by the specified device, so prevent page faults as much as possible */
    CU_MEM_ADVISE_SET_ACCESSED_BY = 5,
    /** Let the Unified Memory subsystem decide on the page faulting policy for the specified device */
    CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6
} CUmem_advise;
/**
 * Memory range attributes
 */
typedef enum CUmem_range_attribute_enum {
    /** Whether the range will mostly be read and only occasionally be written to */
    CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1,
    /** The preferred location of the range */
    CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2,
    /** Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */
    CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3,
    /** The last location to which the range was prefetched */
    CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4
} CUmem_range_attribute;
/**
 * Online compiler and linker options
 */
typedef enum CUjit_option_enum {
    /**
     * Max number of registers that a thread may use.\n
     * Option type: unsigned int\n
     * Applies to: compiler only
     */
    CU_JIT_MAX_REGISTERS = 0,

    /**
     * IN: Specifies minimum number of threads per block to target compilation
     * for\n
     * OUT: Returns the number of threads the compiler actually targeted.
     * This restricts the resource utilization of the compiler (e.g. max
     * registers) such that a block with the given number of threads should be
     * able to launch based on register limitations. Note, this option does not
     * currently take into account any other resource limitations, such as
     * shared memory utilization.\n
     * Cannot be combined with ::CU_JIT_TARGET.\n
     * Option type: unsigned int\n
     * Applies to: compiler only
     */
    CU_JIT_THREADS_PER_BLOCK = 1,

    /**
     * Overwrites the option value with the total wall clock time, in
     * milliseconds, spent in the compiler and linker\n
     * Option type: float\n
     * Applies to: compiler and linker
     */
    CU_JIT_WALL_TIME = 2,

    /**
     * Pointer to a buffer in which to print any log messages
     * that are informational in nature (the buffer size is specified via
     * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n
     * Option type: char *\n
     * Applies to: compiler and linker
     */
    CU_JIT_INFO_LOG_BUFFER = 3,

    /**
     * IN: Log buffer size in bytes. Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages\n
     * Option type: unsigned int\n
     * Applies to: compiler and linker
     */
    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,

    /**
     * Pointer to a buffer in which to print any log messages that
     * reflect errors (the buffer size is specified via option
     * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
     * Option type: char *\n
     * Applies to: compiler and linker
     */
    CU_JIT_ERROR_LOG_BUFFER = 5,

    /**
     * IN: Log buffer size in bytes. Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages\n
     * Option type: unsigned int\n
     * Applies to: compiler and linker
     */
    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,

    /**
     * Level of optimizations to apply to generated code (0 - 4), with 4
     * being the default and highest level of optimizations.\n
     * Option type: unsigned int\n
     * Applies to: compiler only
     */
    CU_JIT_OPTIMIZATION_LEVEL = 7,

    /**
     * No option value required. Determines the target based on the current
     * attached context (default)\n
     * Option type: No option value needed\n
     * Applies to: compiler and linker
     */
    CU_JIT_TARGET_FROM_CUCONTEXT = 8,

    /**
     * Target is chosen based on supplied ::CUjit_target. Cannot be
     * combined with ::CU_JIT_THREADS_PER_BLOCK.\n
     * Option type: unsigned int for enumerated type ::CUjit_target\n
     * Applies to: compiler and linker
     */
    CU_JIT_TARGET = 9,

    /**
     * Specifies choice of fallback strategy if matching cubin is not found.
     * Choice is based on supplied ::CUjit_fallback. This option cannot be
     * used with cuLink* APIs as the linker requires exact matches.\n
     * Option type: unsigned int for enumerated type ::CUjit_fallback\n
     * Applies to: compiler only
     */
    CU_JIT_FALLBACK_STRATEGY = 10,

    /**
     * Specifies whether to create debug information in output (-g)
     * (0: false, default)\n
     * Option type: int\n
     * Applies to: compiler and linker
     */
    CU_JIT_GENERATE_DEBUG_INFO = 11,

    /**
     * Generate verbose log messages (0: false, default)\n
     * Option type: int\n
     * Applies to: compiler and linker
     */
    CU_JIT_LOG_VERBOSE = 12,

    /**
     * Generate line number information (-lineinfo) (0: false, default)\n
     * Option type: int\n
     * Applies to: compiler only
     */
    CU_JIT_GENERATE_LINE_INFO = 13,

    /**
     * Specifies whether to enable caching explicitly (-dlcm) \n
     * Choice is based on supplied ::CUjit_cacheMode_enum.\n
     * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n
     * Applies to: compiler only
     */
    CU_JIT_CACHE_MODE = 14,

    /**
     * The below jit options are used for internal purposes only, in this version of CUDA
     */
    CU_JIT_NEW_SM3X_OPT = 15,
    CU_JIT_FAST_COMPILE = 16,

    /**
     * Array of device symbol names that will be relocated to the corresponding
     * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n
     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
     * When loading a device module, driver will relocate all encountered
     * unresolved symbols to the host addresses.\n
     * It is only allowed to register symbols that correspond to unresolved
     * global variables.\n
     * It is illegal to register the same device symbol at multiple addresses.\n
     * Option type: const char **\n
     * Applies to: dynamic linker only
     */
    CU_JIT_GLOBAL_SYMBOL_NAMES = 17,

    /**
     * Array of host addresses that will be used to relocate corresponding
     * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n
     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
     * Option type: void **\n
     * Applies to: dynamic linker only
     */
    CU_JIT_GLOBAL_SYMBOL_ADDRESSES = 18,

    /**
     * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and
     * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n
     * Option type: unsigned int\n
     * Applies to: dynamic linker only
     */
    CU_JIT_GLOBAL_SYMBOL_COUNT = 19,

    /* Sentinel: left implicit so it always follows the last option above. */
    CU_JIT_NUM_OPTIONS
} CUjit_option;
/**
 * Online compilation targets
 */
typedef enum CUjit_target_enum {
    /** Compute device class 2.0 */
    CU_TARGET_COMPUTE_20 = 20,
    /** Compute device class 2.1 */
    CU_TARGET_COMPUTE_21 = 21,
    /** Compute device class 3.0 */
    CU_TARGET_COMPUTE_30 = 30,
    /** Compute device class 3.2 */
    CU_TARGET_COMPUTE_32 = 32,
    /** Compute device class 3.5 */
    CU_TARGET_COMPUTE_35 = 35,
    /** Compute device class 3.7 */
    CU_TARGET_COMPUTE_37 = 37,
    /** Compute device class 5.0 */
    CU_TARGET_COMPUTE_50 = 50,
    /** Compute device class 5.2 */
    CU_TARGET_COMPUTE_52 = 52,
    /** Compute device class 5.3 */
    CU_TARGET_COMPUTE_53 = 53,
    /** Compute device class 6.0 */
    CU_TARGET_COMPUTE_60 = 60,
    /** Compute device class 6.1 */
    CU_TARGET_COMPUTE_61 = 61,
    /** Compute device class 6.2 */
    CU_TARGET_COMPUTE_62 = 62,
    /** Compute device class 7.0 */
    CU_TARGET_COMPUTE_70 = 70,
    /** Compute device class 7.5 */
    CU_TARGET_COMPUTE_75 = 75
} CUjit_target;
/**
 * Cubin matching fallback strategies
 */
typedef enum CUjit_fallback_enum {
    /** Prefer to compile ptx if exact binary match not found */
    CU_PREFER_PTX = 0,
    /** Prefer to fall back to compatible binary code if exact match not found */
    CU_PREFER_BINARY = 1
} CUjit_fallback;
/**
 * Caching modes for dlcm
 */
typedef enum CUjit_cacheMode_enum {
    /** Compile with no -dlcm flag specified */
    CU_JIT_CACHE_OPTION_NONE = 0,
    /** Compile with L1 cache disabled */
    CU_JIT_CACHE_OPTION_CG = 1,
    /** Compile with L1 cache enabled */
    CU_JIT_CACHE_OPTION_CA = 2
} CUjit_cacheMode;
/**
* Device code formats
*/
typedef enum CUjitInputType_enum