Provides useful macros and CUDA functions. More...
#include "MNUtilities.h"#include <cuda_runtime.h>#include <cutil_inline.h>#include <cutil_math.h>Go to the source code of this file.
Classes | |
| class | ReduceOperatorTraits< T, oper > |
| Reduction operator traits class. More... | |
Defines | |
| #define | MNCUDA_CHECKERROR mncudaSafeCallNoSync(mncudaCheckError(false)) |
| Performs a CUDA synchronization and checks for returned errors (optional in release mode). | |
| #define | mncudaSafeCallNoSync(err) __mncudaSafeCallNoSync(err, __FILE__, __LINE__) |
| CUDA error check macro. Comparable to cudaSafeCallNoSync, however using MNFatal(). | |
| #define | mncudaCheckErrorCUtil(err) __mncudaCheckErrorCUtil(err, __FILE__, __LINE__) |
| CUDA CUtil error check macro. Uses MNFatal(). | |
| #define | MNCUDA_DIVUP(count, chunkSize) ((count) / (chunkSize) + (((count) % (chunkSize))?1:0)) |
| Divides count by chunkSize and adds 1 if there is some remainder. | |
| #define | MNCUDA_MAKEGRID2D(numBlocks, maxBlocks) dim3(min((numBlocks), (maxBlocks)), 1 + (numBlocks) / (maxBlocks), 1) |
| Avoids the maximum CUDA grid size by using two grid dimensions for a one dimensional grid. I added this due to problems with exceeding 1D grid sizes. | |
| #define | MNCUDA_GRID2DINDEX (blockIdx.x + (blockIdx.y*gridDim.x)) |
| Calculates the one dimensional block index for the given 2D grid that was created by MNCUDA_MAKEGRID2D(). | |
| #define | MNCUDA_ALIGN_BYTES(size, alignment) |
| Computes the aligned byte size for a given alignment. | |
| #define | MNCUDA_ALIGN(count) MNCUDA_ALIGN_EX(count, 16) |
| Computes the aligned element count for an alignment of 16 elements. | |
| #define | MNCUDA_ALIGN_NZERO(count) MNCUDA_ALIGN_EX((((count) == 0) ? 1 : (count)), 16) |
| Computes the aligned element count with special treatment for zero counts. | |
| #define | MNCUDA_ALIGN_EX(count, alignment) |
| Computes the aligned element count, extended version. | |
Enumerations | |
| enum | MNCudaOP { MNCuda_ADD, MNCuda_SUB, MNCuda_MUL, MNCuda_DIV, MNCuda_MIN, MNCuda_MAX } |
Operator types used in utility algorithms. These operators are usually used as binary operators, e.g. when working component-wise on two arrays. More... | |
Functions | |
| void | __mncudaSafeCallNoSync (cudaError err, const char *file, const int line) |
| Checks for CUDA errors without synchronization. Uses MNFatal() to report the error. | |
| void | __mncudaCheckErrorCUtil (CUTBoolean err, const char *file, const int line) |
| Checks for CUDA CUtil errors without synchronization. Uses MNFatal() to report the error. | |
| cudaError_t | mncudaCheckError (bool bForce=true) |
| Checks for pending errors and returns them. | |
| void | mncudaEnableErrorChecks (bool bEnable=true) |
| Enables error checks using mncudaCheckError(). | |
| bool | mncudaIsErrorChecking () |
| Gets whether error checking is enabled. | |
| uint | mncudaGetMaxBlockSize (uint reqSharedMemPerThread, uint maxRegPerThread, bool useMultipleWarpSize=true) |
| Computes best thread block size for given shared memory requirement. | |
| void | mncudaInitIdentity (uint *d_buffer, uint count) |
| Initializes the given buffer with the identity relation, that is buffer[i] = i. | |
| template<class T > | |
| void | mncudaInitConstant (T *d_buffer, uint count, T constant) |
| Initializes the given buffer with a constant value. | |
| void | mncudaAddIdentity (uint *d_buffer, uint count) |
| Adds the index to all elements of the given buffer. | |
| template<MNCudaOP op, class T > | |
| void | mncudaConstantOp (T *d_array, uint count, T constant) |
| Performs constant operation on all array elements: | |
| template<class V , class S > | |
| void | mncudaScaleVectorArray (V *d_vecArray, uint count, S scalar) |
| Scales given vector array by given scalar (component wise). | |
| template<class V , class S > | |
| void | mncudaAverageArray (V *d_array, uint count, S *d_counts) |
| Averages given vector array (type of V) by dividing each element by the corresponding count. | |
| void | mncudaInverseBinary (uint *d_buffer, uint count) |
| Inverses the given "binary" 0-1-buffer. | |
| template<MNCudaOP op, class T > | |
| void | mncudaArrayOp (T *d_target, T *d_other, uint count) |
| Performs an operation on the given two arrays. | |
| template<class T > | |
| uint | mncudaResizeMNCudaMem (T **d_buffer, uint numOld, uint numRequest, uint slices=1) |
| Resizes the given buffer from numOld to at least numRequest elements. | |
| template<class T > | |
| cudaError_t | mncudaReduce (T &result, T *d_data, uint count, MNCudaOP op, T identity) |
| Performs reduction on d_data. | |
| template<class T > | |
| cudaError_t | mncudaSegmentedReduce (T *d_data, uint *d_owner, uint count, MNCudaOP op, T identity, T *d_result, uint numSegments) |
| Performs segmented reduction on d_data. | |
| template<class T > | |
| void | mncudaSetAtAddress (T *d_array, uint *d_address, T *d_vals, uint countVals) |
| Moves data from device memory d_vals to device memory d_array using target addresses specified in d_address. | |
| template<class T > | |
| void | mncudaSetConstAtAddress (T *d_array, uint *d_address, T constant, uint countVals) |
| Moves a given constant to device memory d_array at the adresses specified in d_address. | |
| template<class T > | |
| void | mncudaSetFromAddress (T *d_array, uint *d_srcAddr, T *d_vals, uint countTarget) |
| Moves data from device memory d_vals to device memory d_array using source addresses specified in d_srcAddr. | |
| void | mncudaAlignCounts (uint *d_outAligned, uint *d_counts, uint count) |
| Aligns the given count array by aligning all element counts. | |
| template<class T > | |
| cudaError_t | mncudaPrintArray (T *d_array, uint count, bool isFloat, const char *strArray=NULL) |
Prints the given array using printf. | |
| template<class T > | |
| void | mncudaCompactInplace (T *d_data, uint *d_srcAddr, uint countOld, uint countNew) |
| Compacts the data array "inplace" using a temporary buffer, the given source adresses and count. | |
| uint | mncudaGenCompactAddresses (uint *d_isValid, uint countOld, uint *d_outSrcAddr) |
| Generates compact addresses using CUDPP's compact. | |
| void | mncudaNormalize (float4 *d_vectors, uint numVectors) |
| Normalizes each element of a given vector array. | |
Provides useful macros and CUDA functions.
| #define MNCUDA_ALIGN | ( | count ) | MNCUDA_ALIGN_EX(count, 16) |
Computes the aligned element count for an alignment of 16 elements.
This is required to gain profit from coalesced access.
| count | Number of elements to align. |
| #define MNCUDA_ALIGN_BYTES | ( | size, | |
| alignment | |||
| ) |
( ( ((size) % (alignment)) == 0 ) ? \
(size) : \
((size) + (alignment) - ((size) % (alignment))) )
Computes the aligned byte size for a given alignment.
This is required to gain profit from coalesced access or assign linear memory to textures without using offsets.
| size | The size in bytes to align. |
| alignment | The alignment in bytes. |
| #define MNCUDA_ALIGN_EX | ( | count, | |
| alignment | |||
| ) |
( ( ((count) % (alignment)) == 0 ) ? \
(count) : \
((count) + (alignment) - ((count) % (alignment))) )
Computes the aligned element count, extended version.
We allow here to pass an alignment (number of elements). This can be useful when aligning counts for linear texture memory which requires a special device-dependent alignment.
| count | Number of elements to align. |
| alignment | The alignment, as a number of elements. |
| #define MNCUDA_ALIGN_NZERO | ( | count ) | MNCUDA_ALIGN_EX((((count) == 0) ? 1 : (count)), 16) |
Computes the aligned element count with special treatment for zero counts.
This version avoids zero counts by aligning them to a non-zero value.
| count | Number of elements to align. |
| #define MNCUDA_DIVUP | ( | count, | |
| chunkSize | |||
| ) | ((count) / (chunkSize) + (((count) % (chunkSize))?1:0)) |
Divides count by chunkSize and adds 1 if there is some remainder.
| count | Count to divide in chunks, e.g. number of elements. |
| chunkSize | Size of each chunk. |
| #define MNCUDA_GRID2DINDEX (blockIdx.x + (blockIdx.y*gridDim.x)) |
Calculates the one dimensional block index for the given 2D grid that was created by MNCUDA_MAKEGRID2D().
| #define MNCUDA_MAKEGRID2D | ( | numBlocks, | |
| maxBlocks | |||
| ) | dim3(min((numBlocks), (maxBlocks)), 1 + (numBlocks) / (maxBlocks), 1) |
Avoids the maximum CUDA grid size by using two grid dimensions for a one dimensional grid. I added this due to problems with exceeding 1D grid sizes.
| numBlocks | Number of thread blocks. |
| maxBlocks | Maximum size of the grid (number of thread blocks) in the first dimension. |
| enum MNCudaOP |
Operator types used in utility algorithms. These operators are usually used as binary operators, e.g. when working component-wise on two arrays.
| void __mncudaCheckErrorCUtil | ( | CUTBoolean | err, |
| const char * | file, | ||
| const int | line | ||
| ) | [inline] |
Checks for CUDA CUtil errors without synchronization. Uses MNFatal() to report the error.
| err | The error code. |
| file | The file where the error occurred. |
| line | The line where the error occurred. |
| void __mncudaSafeCallNoSync | ( | cudaError | err, |
| const char * | file, | ||
| const int | line | ||
| ) | [inline] |
Checks for CUDA errors without synchronization. Uses MNFatal() to report the error.
| err | The error code. |
| file | The file where the error occurred. |
| line | The line where the error occurred. |
Adds the index to all elements of the given buffer.
This corresponds to adding the identity relation to the elements. Here each thread works on one buffer component.
| [in,out] | d_buffer | The buffer to update. |
| count | Number of elements. |
Aligns the given count array by aligning all element counts.
d_outAligned[i] = MNCUDA_ALIGN_NZERO(d_counts[i])
This should be useful to get coalesced access when accessing offsets calculated by scanning the count array. These offsets are aligned, too. Note that th MNCUDA_ALIGN_NZERO macro is used to provide special handling of zero counts. This ensures that even zero counts would get aligned to a non-zero count and helps avoiding problems with the corresponding offsets. If this would be left out, two adjacent elements would get the same offsets. Parallel access at these offsets could create a race condition.
| [out] | d_outAligned | The aligned counts array. Has to be preallocated. |
| [in] | d_counts | The counts array. |
| count | Number of elements (counts). |
| void mncudaArrayOp | ( | T * | d_target, |
| T * | d_other, | ||
| uint | count | ||
| ) |
Performs an operation on the given two arrays.
d_target[i] = d_target[i] op d_other[i]
Here each thread handles one component i.
| op | Operator type. |
| T | Element type. |
| [in,out] | d_target | Target array. |
| [in] | d_other | Other array. |
| count | Number of elements in both arrays. |
| void mncudaAverageArray | ( | V * | d_array, |
| uint | count, | ||
| S * | d_counts | ||
| ) |
Averages given vector array (type of V) by dividing each element by the corresponding count.
d_array[i] = d_array[i] / d_counts[i]
If d_counts[i] is zero, d_array[i] stays unchanged. Each spawned thread works on one component of the buffers.
| V | Vector type. |
| S | Scalar type. |
| [in,out] | d_array | The array of vectors to scale. |
| count | Number of elements in both d_array and d_counts. | |
| [in] | d_counts | Counts used for averaging. |
| cudaError_t mncudaCheckError | ( | bool | bForce ) |
Checks for pending errors and returns them.
A synchronization is required to get all such errors. Hence this might be too costly for release mode. Therefore I added a way to make these checks optional in release mode, more precisely controllable using mncudaEnableErrorChecks().
| bForce | true to force error check. Is used in debug mode, see MNCUDA_CHECKERROR, to enforce error checks, even if disabled. If false is passed, the error check is performed only if checks are enabled. |
cudaThreadSynchronize(), if check is performed. Else cudaSuccess is returned. | void mncudaCompactInplace | ( | T * | d_data, |
| uint * | d_srcAddr, | ||
| uint | countOld, | ||
| uint | countNew | ||
| ) |
Compacts the data array "inplace" using a temporary buffer, the given source adresses and count.
Enables to compact a structure of arrays using only one real compact and multiple set from addresses.
| T | Element type. |
| [in,out] | d_data | The data array to compact. Operation is performed "inplace". |
| [in] | d_srcAddr | The source addresses. |
| countOld | The original count (elements in d_data). | |
| countNew | The new count (elements in d_srcAddr). Note that this should normally be available, as it will be retrieved from the source address generation. |
| void mncudaConstantOp | ( | T * | d_array, |
| uint | count, | ||
| T | constant | ||
| ) |
Performs constant operation on all array elements:
d_array[i] = d_array[i] op constant
Each spawned thread works on one array component.
| op | Operator type. |
| T | Element type. |
| [in,out] | d_array | The array to manipulate. |
| count | Number of elements. | |
| constant | The constant. |
| void mncudaEnableErrorChecks | ( | bool | bEnable ) |
Enables error checks using mncudaCheckError().
This function is useful in release mode only, as in debug mode, error checks are forced.
| bEnable | true to enable, false to disable. |
Generates compact addresses using CUDPP's compact.
These can be used to compact data corresponding to d_isValid without using CUDPP's compact, but using mncudaCompactInplace(). To compact a structure of arrays, you'd have to call this once and mncudaCompactInplace() for each array. In my tests I observed that this is much more efficient than multiple cudppCompact calls.
| [in] | d_isValid | Contains 1 if entry is valid, 0 if it should be dropped. |
| countOld | Old count before compacting. | |
| [out] | d_outSrcAddr | The source address array. Device memory provided by caller. |
| uint mncudaGetMaxBlockSize | ( | uint | reqSharedMemPerThread, |
| uint | maxRegPerThread, | ||
| bool | useMultipleWarpSize | ||
| ) |
Computes best thread block size for given shared memory requirement.
Thread block size is limited due to maximum shared memory per block. This maximum depends on the actual CUDA GPU.
| reqSharedMemPerThread | Required shared memory per thread in bytes. |
| maxRegPerThread | Maximum number of registers per thread. |
| useMultipleWarpSize | Whether to round the maximum to the next multiple of the device's warp size. |
| void mncudaInitConstant | ( | T * | d_buffer, |
| uint | count, | ||
| T | constant | ||
| ) |
Initializes the given buffer with a constant value.
Each spawned thread works on one array component.
| T | Element type of the buffer. |
| [in,out] | d_buffer | Device buffer to initialize. |
| count | Number of elements. | |
| constant | The constant. |
Initializes the given buffer with the identity relation, that is buffer[i] = i.
Each component is handled by it's own CUDA thread.
| [in,out] | d_buffer | Device buffer to initialize. |
| count | The number of elements in d_buffer. |
Inverses the given "binary" 0-1-buffer.
This is done by setting all one (1) components to zero (0) and all zero components to one respectively. Each thread handles one component of the buffer.
| [in,out] | d_buffer | The binary buffer to inverse. |
| count | Number of elements in d_buffer. |
| bool mncudaIsErrorChecking | ( | ) |
Gets whether error checking is enabled.
true if enabled, else false. | void mncudaNormalize | ( | float4 * | d_vectors, |
| uint | numVectors | ||
| ) |
Normalizes each element of a given vector array.
| [in,out] | d_vectors | The vector array to normalize. Array is given as float4 array of three-dimensional vectors to ensure alignment, where the actual vector is in the xyz components. |
| numVectors | Number of vectors. |
| cudaError_t mncudaPrintArray | ( | T * | d_array, |
| uint | count, | ||
| bool | isFloat, | ||
| const char * | strArray | ||
| ) |
Prints the given array using printf.
For debugging purposes only. Too slow for release code.
| T | Element type. |
| [in] | d_array | The device array to print out. |
| count | Number of elements of d_array to print. | |
| isFloat | Whether T is floating point type. | |
| strArray | The name of the array. Used for printing purposes. Might be NULL. |
cudaSuccess if OK, else some error. | cudaError_t mncudaReduce | ( | T & | result, |
| T * | d_data, | ||
| uint | count, | ||
| MNCudaOP | op, | ||
| T | identity | ||
| ) |
Performs reduction on d_data.
d_data remains unchanged. The reduction depends on the passed operator. The reduction algorithm is implemented with the help of the CUDA SDK sample.
| T | Element type of input and output arrays. |
| [out] | result | Reduction result. Single element of type T. |
| [in] | d_data | Data to reduce, remains unchanged. |
| count | Number of elements in d_data. | |
| op | The reduction operator. One of MNCuda_ADD, MNCuda_MIN, MNCuda_MAX. | |
| identity | Identity value associated with op. |
| uint mncudaResizeMNCudaMem | ( | T ** | d_buffer, |
| uint | numOld, | ||
| uint | numRequest, | ||
| uint | slices | ||
| ) |
Resizes the given buffer from numOld to at least numRequest elements.
Copies the contents of the old buffer to the beginning of the new buffer. Works only for MNCudaMemPool buffers.
Additionally, the buffer can be organized into slices of contiguously placed elements. When having slices > 1, it is assumed that d_buffer has numOld times slices elements before the call. The buffer is resized so that each slice is resized to at least numRequest elements.
| T | Element type of buffer. |
| [in,out] | d_buffer | The old buffer. After execution it is free'd and replaced by the new buffer. Has to be valid MNCudaMemPool memory. |
| numOld | Old number of elements in a single slice. | |
| numRequest | Requested new number of elements in a single slice. | |
| slices | Number of slices. Usually 1, however you might have a buffer consiting of multiple slices of numOld elements and want to resize it to multiple slices of at least numNew elements. |
| void mncudaScaleVectorArray | ( | V * | d_vecArray, |
| uint | count, | ||
| S | scalar | ||
| ) |
Scales given vector array by given scalar (component wise).
Each spawned thread works on one array component.
| V | Vector type. |
| S | Scalar type. |
| [in,out] | d_vecArray | The array of vectors to scale. |
| count | Number of vectors. | |
| scalar | The scalar. |
| cudaError_t mncudaSegmentedReduce | ( | T * | d_data, |
| uint * | d_owner, | ||
| uint | count, | ||
| MNCudaOP | op, | ||
| T | identity, | ||
| T * | d_result, | ||
| uint | numSegments | ||
| ) |
Performs segmented reduction on d_data.
Segments are defined by d_owner, where d_owner[i] contains the segment of d_data[i]. The result is put into d_result. This array has to be preallocated and should have space for all segment results.
Algorithmic idea: http://www.nvidia.com/object/nvidia_research_pub_013.html.
| T | Element type of d_data. |
| [in] | d_data | The segmented data to perform reduction on. Count A. It is assumed that the data of the same segment is stored contiguously. |
| [in] | d_owner | The data-segment association list. Count A. |
| count | Defines A, that is the count in d_data and d_owner. | |
| op | The reduction operator. One of MNCuda_ADD, MNCuda_MIN, MNCuda_MAX. | |
| identity | Identity value associated with op. | |
| [out] | d_result | Takes the result, that is the reduction result of each segment. Size of B <= A. |
| numSegments | Defines B, the number of segments. |
| void mncudaSetAtAddress | ( | T * | d_array, |
| uint * | d_address, | ||
| T * | d_vals, | ||
| uint | countVals | ||
| ) |
Moves data from device memory d_vals to device memory d_array using target addresses specified in d_address.
d_array[d_address[i]] = d_vals[i]
| T | Element type of array and values. |
| [in,out] | d_array | The data array to manipulate. |
| [in] | d_address | Addresses of the values to manipulate. It is assumed that all addresses d_address[i] are valid with respect to d_array. |
| [in] | d_vals | The values to fill in. |
| countVals | Number of values in d_address and d_vals. |
| void mncudaSetConstAtAddress | ( | T * | d_array, |
| uint * | d_address, | ||
| T | constant, | ||
| uint | countVals | ||
| ) |
Moves a given constant to device memory d_array at the adresses specified in d_address.
d_array[d_address[i]] = constant
| T | Element type of array and constant. |
| [in,out] | d_array | The data array to manipulate. |
| [in] | d_address | Addresses of the values to manipulate. |
| constant | The constant to move to d_array. | |
| countVals | Number of values in d_address. |
| void mncudaSetFromAddress | ( | T * | d_array, |
| uint * | d_srcAddr, | ||
| T * | d_vals, | ||
| uint | countTarget | ||
| ) |
Moves data from device memory d_vals to device memory d_array using source addresses specified in d_srcAddr.
d_array[i] = d_vals[d_srcAddr[i]]
When the source address is 0xffffffff, the corresponding target entry will get zero'd. This can be helpful for some algorithms.
| T | Element type of array and values. |
| [in,out] | d_array | The data array to manipulate. |
| [in] | d_srcAddr | Addresses of the source values in d_vals. Use 0xffffffff to zero the corresponding entry in d_array. |
| [in] | d_vals | The values to fill in. |
| countTarget | Number of values in d_array and d_srcAddr. |
| MNRT Source Code Documentation (Version 1.0) - Copyright © Mathias Neumann 2010 |
Generated on Tue Nov 30 2010 14:28:27 for MNRT by 1.7.2
|