/* * Copyright (C) 2011 Shinpei Kato * * Systems Research Lab, University of California at Santa Cruz * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #ifndef __CUDA_H__ #define __CUDA_H__ #ifdef __KERNEL__ #include #else #include #endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ typedef struct CUarray_st * CUarray; typedef unsigned long long CUdeviceptr; typedef int CUdevice; typedef struct CUctx_st* CUcontext; typedef struct CUmod_st* CUmodule; typedef struct CUfunc_st* CUfunction; typedef struct CUtexref_st* CUtexref; typedef struct CUsurfref_st* CUsurfref; typedef struct CUevent_st* CUevent; typedef struct CUstream_st* CUstream; typedef struct CUgraphicsResource_st* CUgraphicsResource; /** * Context creation flags */ typedef enum CUctx_flags_enum { CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling \deprecated */ CU_CTX_SCHED_MASK = 0x07, CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */ CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ CU_CTX_FLAGS_MASK = 0x1f } CUctx_flags; /** * Event creation flags */ typedef enum CUevent_flags_enum { CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ } CUevent_flags; /** * Array formats */ typedef enum CUarray_format_enum { CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ } CUarray_format; /** * 3D array descriptor */ typedef struct CUDA_ARRAY3D_DESCRIPTOR_st { size_t Depth; unsigned int Flags; CUarray_format Format; size_t Height; unsigned int NumChannels; size_t Width; } CUDA_ARRAY3D_DESCRIPTOR; /** * Array descriptor */ typedef struct CUDA_ARRAY_DESCRIPTOR_st { CUarray_format Format; size_t Height; unsigned int NumChannels; size_t Width; } CUDA_ARRAY_DESCRIPTOR; /** * Texture reference addressing modes */ typedef enum CUaddress_mode_enum { CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ } CUaddress_mode; /** * Texture reference filtering modes */ typedef enum CUfilter_mode_enum { CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ } CUfilter_mode; /** * Device properties */ typedef enum CUdevice_attribute_enum { CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Peak clock frequency in kilohertz */ CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures*/ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,/**< Maximum cubemap layered texture width/height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,/**< Maximum layers in a cubemap layered texture */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,/**< Maximum cubemap layered surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,/**< Maximum layers in a cubemap layered surface */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Maximum 1D linear texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72 /**< Maximum 2D linear texture pitch in bytes */ } CUdevice_attribute; /** * Legacy device properties */ typedef struct CUdevprop_st { int maxThreadsPerBlock; /**< Maximum number of threads per block */ int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ int sharedMemPerBlock; /**< Shared memory available per block in bytes */ int totalConstantMemory; /**< Constant memory available on device in bytes */ int SIMDWidth; /**< Warp size in threads */ int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ int regsPerBlock; /**< 32-bit registers available per block */ int clockRate; /**< Clock frequency in kilohertz */ int textureAlign; /**< Alignment requirement for textures */ } CUdevprop; /** * Pointer information */ typedef enum CUpointer_attribute_enum { CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ } CUpointer_attribute; /** * Function properties */ typedef enum CUfunction_attribute_enum { /** * The maximum number of threads per block, beyond which a launch of the * function would fail. This number depends on both the function and the * device on which the function is currently loaded. */ CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, /** * The size in bytes of statically-allocated shared memory required by * this function. This does not include dynamically-allocated shared * memory requested by the user at runtime. */ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, /** * The size in bytes of user-allocated constant memory required by this * function. */ CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, /** * The size in bytes of local memory used by each thread of this function. */ CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, /** * The number of registers used by each thread of this function. */ CU_FUNC_ATTRIBUTE_NUM_REGS = 4, /** * The PTX virtual architecture version for which the function was * compiled. This value is the major PTX version * 10 + the minor PTX * version, so a PTX version 1.3 function would return the value 13. * Note that this may return the undefined value of 0 for cubins * compiled prior to CUDA 3.0. */ CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, /** * The binary architecture version for which the function was compiled. * This value is the major binary version * 10 + the minor binary version, * so a binary version 1.3 function would return the value 13. Note that * this will return a value of 10 for legacy cubins that do not have a * properly-encoded binary architecture version. */ CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, CU_FUNC_ATTRIBUTE_MAX } CUfunction_attribute; /** * Function cache configurations */ typedef enum CUfunc_cache_enum { CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ } CUfunc_cache; /** * Memory types */ typedef enum CUmemorytype_enum { CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */ CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ } CUmemorytype; /** * 2D memory copy parameters */ typedef struct CUDA_MEMCPY2D_st { CUarray dstArray; CUdeviceptr dstDevice; void * dstHost; CUmemorytype dstMemoryType; size_t dstPitch; size_t dstXInBytes; size_t dstY; size_t Height; CUarray srcArray; CUdeviceptr srcDevice; const void * srcHost; CUmemorytype srcMemoryType; size_t srcPitch; size_t srcXInBytes; size_t srcY; size_t WidthInBytes; } CUDA_MEMCPY2D; /** * 3D memory copy parameters */ typedef struct CUDA_MEMCPY3D_st { size_t Depth; CUarray dstArray; CUdeviceptr dstDevice; size_t dstHeight; void * dstHost; size_t dstLOD; CUmemorytype dstMemoryType; size_t dstPitch; size_t dstXInBytes; size_t dstY; size_t dstZ; size_t Height; void * reserved0; void * reserved1; CUarray srcArray; CUdeviceptr srcDevice; size_t srcHeight; const void * srcHost; size_t srcLOD; CUmemorytype srcMemoryType; size_t srcPitch; size_t srcXInBytes; size_t srcY; size_t srcZ; size_t WidthInBytes; } CUDA_MEMCPY3D; /** * 3D memory cross-context copy parameters */ typedef struct CUDA_MEMCPY3D_PEER_st { size_t Depth; CUarray dstArray; CUcontext dstContext; CUdeviceptr dstDevice; size_t dstHeight; void * dstHost; size_t dstLOD; CUmemorytype dstMemoryType; size_t dstPitch; size_t dstXInBytes; size_t dstY; size_t dstZ; size_t Height; CUarray srcArray; CUcontext srcContext; CUdeviceptr srcDevice; size_t srcHeight; const void * srcHost; size_t srcLOD; CUmemorytype srcMemoryType; size_t srcPitch; size_t srcXInBytes; size_t srcY; size_t srcZ; size_t WidthInBytes; } CUDA_MEMCPY3D_PEER; /** * Compute Modes */ typedef enum CUcomputemode_enum { CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ CU_COMPUTEMODE_EXCLUSIVE = 1, /**< Compute-exclusive-thread mode (Only one context used by a single thread can be present on this device at a time) */ CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */ } CUcomputemode; /** * Online compiler options */ typedef enum CUjit_option_enum { /** * Max number of registers that a thread may use.\n * Option type: unsigned int */ CU_JIT_MAX_REGISTERS = 0, /** * IN: Specifies minimum number of threads per block to target compilation * for\n * OUT: Returns the number of threads the compiler actually targeted. * This restricts the resource utilization fo the compiler (e.g. max * registers) such that a block with the given number of threads should be * able to launch based on register limitations. Note, this option does not * currently take into account any other resource limitations, such as * shared memory utilization.\n * Option type: unsigned int */ CU_JIT_THREADS_PER_BLOCK, /** * Returns a float value in the option of the wall clock time, in * milliseconds, spent creating the cubin\n * Option type: float */ CU_JIT_WALL_TIME, /** * Pointer to a buffer in which to print any log messsages from PTXAS * that are informational in nature (the buffer size is specified via * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n * Option type: char* */ CU_JIT_INFO_LOG_BUFFER, /** * IN: Log buffer size in bytes. Log messages will be capped at this size * (including null terminator)\n * OUT: Amount of log buffer filled with messages\n * Option type: unsigned int */ CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, /** * Pointer to a buffer in which to print any log messages from PTXAS that * reflect errors (the buffer size is specified via option * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n * Option type: char* */ CU_JIT_ERROR_LOG_BUFFER, /** * IN: Log buffer size in bytes. Log messages will be capped at this size * (including null terminator)\n * OUT: Amount of log buffer filled with messages\n * Option type: unsigned int */ CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, /** * Level of optimizations to apply to generated code (0 - 4), with 4 * being the default and highest level of optimizations.\n * Option type: unsigned int */ CU_JIT_OPTIMIZATION_LEVEL, /** * No option value required. Determines the target based on the current * attached context (default)\n * Option type: No option value needed */ CU_JIT_TARGET_FROM_CUCONTEXT, /** * Target is chosen based on supplied ::CUjit_target_enum.\n * Option type: unsigned int for enumerated type ::CUjit_target_enum */ CU_JIT_TARGET, /** * Specifies choice of fallback strategy if matching cubin is not found. * Choice is based on supplied ::CUjit_fallback_enum.\n * Option type: unsigned int for enumerated type ::CUjit_fallback_enum */ CU_JIT_FALLBACK_STRATEGY } CUjit_option; /** * Online compilation targets */ typedef enum CUjit_target_enum { CU_TARGET_COMPUTE_10 = 0, /**< Compute device class 1.0 */ CU_TARGET_COMPUTE_11, /**< Compute device class 1.1 */ CU_TARGET_COMPUTE_12, /**< Compute device class 1.2 */ CU_TARGET_COMPUTE_13, /**< Compute device class 1.3 */ CU_TARGET_COMPUTE_20, /**< Compute device class 2.0 */ CU_TARGET_COMPUTE_21, /**< Compute device class 2.1 */ CU_TARGET_COMPUTE_30 /**< Compute device class 3.0 */ } CUjit_target; /** * Cubin matching fallback strategies */ typedef enum CUjit_fallback_enum { CU_PREFER_PTX = 0, /**< Prefer to compile ptx */ CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code */ } CUjit_fallback; /** * Flags to register a graphics resource */ typedef enum CUgraphicsRegisterFlags_enum { CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04 } CUgraphicsRegisterFlags; /** * Flags for mapping and unmapping interop resources */ typedef enum CUgraphicsMapResourceFlags_enum { CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 } CUgraphicsMapResourceFlags; /** * Array indices for cube faces */ typedef enum CUarray_cubemap_face_enum { CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ } CUarray_cubemap_face; /** * Limits */ typedef enum CUlimit_enum { CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 /**< GPU malloc heap size */ } CUlimit; /** * Interprocess Handles */ typedef enum CUipcMem_flags_enum { CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /** < Automatically enable peer access between remote devices as needed */ } CUipcMem_flags; /** * Error codes */ typedef enum cudaError_enum { /** * The API call returned with no errors. In the case of query calls, this * can also mean that the operation being queried is complete (see * ::cuEventQuery() and ::cuStreamQuery()). */ CUDA_SUCCESS = 0, /** * This indicates that one or more of the parameters passed to the API call * is not within an acceptable range of values. */ CUDA_ERROR_INVALID_VALUE = 1, /** * The API call failed because it was unable to allocate enough memory to * perform the requested operation. */ CUDA_ERROR_OUT_OF_MEMORY = 2, /** * This indicates that the CUDA driver has not been initialized with * ::cuInit() or that initialization has failed. */ CUDA_ERROR_NOT_INITIALIZED = 3, /** * This indicates that the CUDA driver is in the process of shutting down. */ CUDA_ERROR_DEINITIALIZED = 4, /** * This indicates profiling APIs are called while application is running * in visual profiler mode. */ CUDA_ERROR_PROFILER_DISABLED = 5, /** * This indicates profiling has not been initialized for this context. * Call cuProfilerInitialize() to resolve this. */ CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, /** * This indicates profiler has already been started and probably * cuProfilerStart() is incorrectly called. */ CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, /** * This indicates profiler has already been stopped and probably * cuProfilerStop() is incorrectly called. */ CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, /** * This indicates that no CUDA-capable devices were detected by the installed * CUDA driver. */ CUDA_ERROR_NO_DEVICE = 100, /** * This indicates that the device ordinal supplied by the user does not * correspond to a valid CUDA device. */ CUDA_ERROR_INVALID_DEVICE = 101, /** * This indicates that the device kernel image is invalid. This can also * indicate an invalid CUDA module. */ CUDA_ERROR_INVALID_IMAGE = 200, /** * This most frequently indicates that there is no context bound to the * current thread. This can also be returned if the context passed to an * API call is not a valid handle (such as a context that has had * ::cuCtxDestroy() invoked on it). This can also be returned if a user * mixes different API versions (i.e. 3010 context with 3020 API calls). * See ::cuCtxGetApiVersion() for more details. */ CUDA_ERROR_INVALID_CONTEXT = 201, /** * This indicated that the context being supplied as a parameter to the * API call was already the active context. * \deprecated * This error return is deprecated as of CUDA 3.2. It is no longer an * error to attempt to push the active context via ::cuCtxPushCurrent(). */ CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, /** * This indicates that a map or register operation has failed. */ CUDA_ERROR_MAP_FAILED = 205, /** * This indicates that an unmap or unregister operation has failed. */ CUDA_ERROR_UNMAP_FAILED = 206, /** * This indicates that the specified array is currently mapped and thus * cannot be destroyed. */ CUDA_ERROR_ARRAY_IS_MAPPED = 207, /** * This indicates that the resource is already mapped. */ CUDA_ERROR_ALREADY_MAPPED = 208, /** * This indicates that there is no kernel image available that is suitable * for the device. This can occur when a user specifies code generation * options for a particular CUDA source file that do not include the * corresponding device configuration. */ CUDA_ERROR_NO_BINARY_FOR_GPU = 209, /** * This indicates that a resource has already been acquired. */ CUDA_ERROR_ALREADY_ACQUIRED = 210, /** * This indicates that a resource is not mapped. */ CUDA_ERROR_NOT_MAPPED = 211, /** * This indicates that a mapped resource is not available for access as an * array. */ CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, /** * This indicates that a mapped resource is not available for access as a * pointer. */ CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, /** * This indicates that an uncorrectable ECC error was detected during * execution. */ CUDA_ERROR_ECC_UNCORRECTABLE = 214, /** * This indicates that the ::CUlimit passed to the API call is not * supported by the active device. */ CUDA_ERROR_UNSUPPORTED_LIMIT = 215, /** * This indicates that the ::CUcontext passed to the API call can * only be bound to a single CPU thread at a time but is already * bound to a CPU thread. */ CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, /** * This indicates that the device kernel source is invalid. */ CUDA_ERROR_INVALID_SOURCE = 300, /** * This indicates that the file specified was not found. */ CUDA_ERROR_FILE_NOT_FOUND = 301, /** * This indicates that a link to a shared object failed to resolve. */ CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, /** * This indicates that initialization of a shared object failed. */ CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, /** * This indicates that an OS call failed. */ CUDA_ERROR_OPERATING_SYSTEM = 304, /** * This indicates that a resource handle passed to the API call was not * valid. Resource handles are opaque types like ::CUstream and ::CUevent. */ CUDA_ERROR_INVALID_HANDLE = 400, /** * This indicates that a named symbol was not found. Examples of symbols * are global/constant variable names, texture names, and surface names. */ CUDA_ERROR_NOT_FOUND = 500, /** * This indicates that asynchronous operations issued previously have not * completed yet. This result is not actually an error, but must be indicated * differently than ::CUDA_SUCCESS (which indicates completion). Calls that * may return this value include ::cuEventQuery() and ::cuStreamQuery(). */ CUDA_ERROR_NOT_READY = 600, /** * An exception occurred on the device while executing a kernel. Common * causes include dereferencing an invalid device pointer and accessing * out of bounds shared memory. The context cannot be used, so it must * be destroyed (and a new one should be created). All existing device * memory allocations from this context are invalid and must be * reconstructed if the program is to continue using CUDA. */ CUDA_ERROR_LAUNCH_FAILED = 700, /** * This indicates that a launch did not occur because it did not have * appropriate resources. This error usually indicates that the user has * attempted to pass too many arguments to the device kernel, or the * kernel launch specifies too many threads for the kernel's register * count. Passing arguments of the wrong size (i.e. a 64-bit pointer * when a 32-bit int is expected) is equivalent to passing too many * arguments and can also result in this error. */ CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, /** * This indicates that the device kernel took too long to execute. This can * only occur if timeouts are enabled - see the device attribute * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The * context cannot be used (and must be destroyed similar to * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from * this context are invalid and must be reconstructed if the program is to * continue using CUDA. */ CUDA_ERROR_LAUNCH_TIMEOUT = 702, /** * This error indicates a kernel launch that uses an incompatible texturing * mode. */ CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, /** * This error indicates that a call to ::cuCtxEnablePeerAccess() is * trying to re-enable peer access to a context which has already * had peer access to it enabled. */ CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, /** * This error indicates that ::cuCtxDisablePeerAccess() is * trying to disable peer access which has not been enabled yet * via ::cuCtxEnablePeerAccess(). */ CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, /** * This error indicates that the primary context for the specified device * has already been initialized. */ CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, /** * This error indicates that the context current to the calling thread * has been destroyed using ::cuCtxDestroy, or is a primary context which * has not yet been initialized. */ CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, /** * A device-side assert triggered during kernel execution. * The context cannot be used anymore, and must be destroyed. * All existing device memory allocations from this context are invalid * and must be reconstructed if the program is to continue using CUDA. */ CUDA_ERROR_ASSERT = 710, /** * This error indicates that the hardware resources required to enable * peer access have been exhausted for one or more of the devices passed * to cuCtxEnablePeerAccess(). */ CUDA_ERROR_TOO_MANY_PEERS = 711, /** * This error indicates that the memory range passed to cuMemHostRegister() * has already been registered. */ CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, /** * This error indicates that the pointer passed to cuMemHostUnregister() * does not correspond to any currently registered memory region. */ CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, /** * This indicates that an unknown internal error has occurred. */ CUDA_ERROR_UNKNOWN = 999 } CUresult; /** * Interprocess Handles */ #define CU_IPC_HANDLE_SIZE 64 /** * End of array terminator for the extra parameter to cuLaunchKernel */ #define CU_LAUNCH_PARAM_END ((void*)0x00) /** * Indicator that the next value in the extra parameter to cuLaunchKernel * will be a pointer to a buffer containing all kernel parameters used for * launching kernel f. This buffer needs to honor all alignment/padding * requirements of the individual parameters. * If CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the extra array, * then CU_LAUNCH_PARAM_BUFFER_POINTER will have no effect. */ #define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) /** * Indicator that the next value in the extra parameter to cuLaunchKernel * will be a pointer to a size_t which contains the size of the buffer * specified with CU_LAUNCH_PARAM_BUFFER_POINTER. * It is required that CU_LAUNCH_PARAM_BUFFER_POINTER also be specified in * the extra array if the value associated with CU_LAUNCH_PARAM_BUFFER_SIZE * is not zero. */ #define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) /** * If set, host memory is portable between CUDA contexts. * Flag for ::cuMemHostAlloc() */ #define CU_MEMHOSTALLOC_PORTABLE 0x01 /** * If set, host memory is mapped into CUDA address space and * ::cuMemHostGetDevicePointer() may be called on the host pointer. * Flag for ::cuMemHostAlloc() */ #define CU_MEMHOSTALLOC_DEVICEMAP 0x02 /** * If set, host memory is allocated as write-combined - fast to write, * faster to DMA, slow to read except via SSE4 streaming load instruction * (MOVNTDQA). * Flag for ::cuMemHostAlloc() */ #define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 /** * If set, host memory is portable between CUDA contexts. * Flag for cuMemHostRegister() */ #define CU_MEMHOSTREGISTER_PORTABLE 0x01 /** * If set, host memory is mapped into CUDA address space and * cuMemHostGetDevicePointer() may be called on the host pointer. * Flag for cuMemHostRegister() */ #define CU_MEMHOSTREGISTER_DEVICEMAP 0x02 /** * For texture references loaded into the module, use default texunit from * texture reference. */ #define CU_PARAM_TR_DEFAULT -1 /** * Override the texref format with a format inferred from the array. * Flag for cuTexRefSetArray() */ #define CU_TRSA_OVERRIDE_FORMAT 0x01 /** * Read the texture as integers rather than promoting the values to floats * in the range [0,1]. * Flag for cuTexRefSetFlags() */ #define CU_TRSF_READ_AS_INTEGER 0x01 /** * Use normalized texture coordinates in the range [0,1) instead of [0,dim). * Flag for cuTexRefSetFlags() */ #define CU_TRSF_NORMALIZED_COORDINATES 0x02 /** * Perform sRGB->linear conversion during texture read. * Flag for cuTexRefSetFlags() */ #define CU_TRSF_SRGB 0x10 /** * If set, the CUDA array is a collection of layers, where each layer is * either a 1D or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR * specifies the number of layers, not the depth of a 3D array. */ #define CUDA_ARRAY3D_LAYERED 0x01 /** * Deprecated, use CUDA_ARRAY3D_LAYERED */ #define CUDA_ARRAY3D_2DARRAY 0x01 /** * This flag must be set in order to bind a surface reference to the CUDA * array */ #define CUDA_ARRAY3D_SURFACE_LDST 0x02 /** * If set, the CUDA array is a collection of six 2D arrays, representing * faces of a cube. The width of such a CUDA array must be equal to its * height, and Depth must be six. If CUDA_ARRAY3D_LAYERED flag is also set, * then the CUDA array is a collection of cubemaps and Depth must be a * multiple of six. */ #define CUDA_ARRAY3D_CUBEMAP 0x04 /** * This flag must be set in order to perform texture gather operations on * a CUDA array. */ #define CUDA_ARRAY3D_TEXTURE_GATHER 0x08 /** * CUDA API version number */ #define CUDA_VERSION 4020 /* Initialization */ CUresult cuInit(unsigned int Flags); /* Device Management */ CUresult cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); CUresult cuDeviceGet(CUdevice *device, int ordinal); CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); CUresult cuDeviceGetCount(int *count); CUresult cuDeviceGetName(char *name, int len, CUdevice dev); CUresult cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); CUresult cuDeviceTotalMem(size_t *bytes, CUdevice dev); /* Event Management */ CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags); CUresult cuEventDestroy(CUevent hEvent); CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); CUresult cuEventQuery(CUevent hEvent); CUresult cuEventRecord(CUevent hEvent, CUstream hStream); CUresult cuEventSynchronize(CUevent hEvent); /* Version Management */ CUresult cuDriverGetVersion (int *driverVersion); /* Context Management */ CUresult cuCtxAttach(CUcontext *pctx, unsigned int flags); CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); CUresult cuCtxDestroy(CUcontext ctx); CUresult cuCtxDetach(CUcontext ctx); CUresult cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); CUresult cuCtxGetCacheConfig(CUfunc_cache *pconfig); CUresult cuCtxGetCurrent(CUcontext *pctx); CUresult cuCtxGetDevice(CUdevice *device); CUresult cuCtxGetLimit(size_t *pvalue, CUlimit limit); CUresult cuCtxPopCurrent(CUcontext *pctx); CUresult cuCtxPushCurrent(CUcontext ctx); CUresult cuCtxSetCacheConfig(CUfunc_cache config); CUresult cuCtxSetCurrent(CUcontext ctx); CUresult cuCtxSetLimit(CUlimit limit, size_t value); CUresult cuCtxSynchronize(void); /* Module Management */ CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); CUresult cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name); CUresult cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); CUresult cuModuleLoad(CUmodule *module, const char *fname); CUresult cuModuleLoadData(CUmodule *module, const void *image); CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); CUresult cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); CUresult cuModuleUnload(CUmodule hmod); /* Execution Control */ CUresult cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); CUresult cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); CUresult cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); CUresult cuLaunch(CUfunction f); CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height); CUresult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); CUresult cuParamSetf(CUfunction hfunc, int offset, float value); CUresult cuParamSeti(CUfunction hfunc, int offset, unsigned int value); CUresult cuParamSetSize(CUfunction hfunc, unsigned int numbytes); CUresult cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); CUresult cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); /* Memory Management (Incomplete) */ CUresult cuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize); CUresult cuMemFree(CUdeviceptr dptr); CUresult cuMemAllocHost(void **pp, unsigned int bytesize); CUresult cuMemFreeHost(void *p); CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount); CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream); CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount); CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); CUresult cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount); CUresult cuMemHostAlloc(void **pp, unsigned int bytesize, unsigned int Flags); CUresult cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); /* Memory mapping - Gdev extension */ CUresult cuMemMap(void **buf, CUdeviceptr dptr, unsigned int bytesize); CUresult cuMemUnmap(void *buf); /* Memory mapped address - Gdev extension */ CUresult cuMemGetPhysAddr(unsigned long long *addr, void *p); /* Stream Management */ CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags); CUresult cuStreamDestroy(CUstream hStream); CUresult cuStreamQuery(CUstream hStream); CUresult cuStreamSynchronize(CUstream hStream); CUresult cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); /* Inter-Process Communication (IPC) - Gdev extension */ CUresult cuShmGet(int *ptr, int key, size_t size, int flags); CUresult cuShmAt(CUdeviceptr *dptr, int id, int flags); CUresult cuShmDt(CUdeviceptr dptr); CUresult cuShmCtl(int id, int cmd, void *buf /* FIXME */); #ifdef __cplusplus } #endif /* __cplusplus */ #endif