diff --git a/src/core/cuda/reduction-operations.h b/src/core/cuda/reduction-operations.h new file mode 100644 index 0000000000000000000000000000000000000000..de7488f5864cd906618e9c0258ed20a9b67180ca --- /dev/null +++ b/src/core/cuda/reduction-operations.h @@ -0,0 +1,226 @@ +/*************************************************************************** + reduction-operations.h - description + ------------------- + begin : Mar 22, 2013 + copyright : (C) 2013 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + +#ifndef REDUCTION_OPERATIONS_H_ +#define REDUCTION_OPERATIONS_H_ + +#ifdef HAVE_CUDA +#include <cuda.h> +#include <core/mfuncs.h> + +enum tnlTupleOperation { tnlParallelReductionMax, + tnlParallelReductionAbsMin, + tnlParallelReductionAbsMax, + tnlParallelReductionAbsSum, + tnlParallelReductionLpNorm, + tnlParallelReductionSdot }; + + +/*** + * This function returns minimum of two numbers stored on the device. + */ +template< class T > __device__ T tnlCudaMin( const T& a, + const T& b ) +{ + return a < b ? a : b; +} + +__device__ int tnlCudaMin( const int& a, + const int& b ) +{ + return min( a, b ); +} + +__device__ float tnlCudaMin( const float& a, + const float& b ) +{ + return fminf( a, b ); +} + +__device__ double tnlCudaMin( const double& a, + const double& b ) +{ + return fmin( a, b ); +} + +/*** + * This function returns maximum of two numbers stored on the device. + */ +template< class T > __device__ T tnlCudaMax( const T& a, + const T& b ) +{ + return a > b ? a : b; +} + +__device__ int tnlCudaMax( const int& a, + const int& b ) +{ + return max( a, b ); +} + +__device__ float tnlCudaMax( const float& a, + const float& b ) +{ + return fmaxf( a, b ); +} + +__device__ double tnlCudaMax( const double& a, + const double& b ) +{ + return fmax( a, b ); +} + +/*** + * This function returns absolute value of given number on the device. + */ +__device__ int tnlCudaAbs( const int& a ) +{ + return abs( a ); +} + +__device__ float tnlCudaAbs( const float& a ) +{ + return fabs( a ); +} + +__device__ double tnlCudaAbs( const double& a ) +{ + return fabs( a ); +} + +template< typename Real, typename Index > +class tnlParallelReductionSum +{ + public: + + typedef Real RealType; + typedef Index IndexType; + + + RealType reduceOnHost( const RealType& data1, + const RealType& data2 ) const + { + return data1 + data2; + } + + __device__ RealType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const + { + return data1[ idx1 ] + data1[ idx2 ]; + } + + __device__ RealType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + const RealType* data2 ) const + { + return data1[ idx1 ]; + }; + + __device__ RealType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const RealType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ]; + }; + + __device__ RealType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] + data2[ idx2 ]; + }; + + __device__ RealType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data ) const + { + return data[ idx1 ] + data[ idx2 ]; + }; +}; + +template< typename Real, typename Index > +class tnlParallelReductionMin +{ + public: + + typedef Real RealType; + typedef Index IndexType; + + + RealType reduceOnHost( const RealType& data1, + const RealType& data2 ) const + { + return Min( data1, data2 ); + } + + __device__ RealType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const + { + return tnlCudaMin( data1[ idx1 ], data1[ idx2 ] ); + } + + __device__ RealType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + const RealType* data2 ) const + { + return data1[ idx1 ]; + }; + + __device__ RealType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const RealType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return tnlCudaMin( data1[ idx1 ], tnlCudaMin( data2[ idx2 ], data2[ idx3 ] ) ); + }; + + __device__ RealType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] ); + }; + + __device__ RealType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data ) const + { + return tnlCudaMin( data[ idx1 ], data[ idx2 ] ); + }; +}; + + + + +#include <implementation/core/cuda/reduction-operations_impl.h> + +#endif + +#endif /* REDUCTION_OPERATIONS_H_ */ diff --git a/src/implementation/core/CMakeLists.txt b/src/implementation/core/CMakeLists.txt index 29eba8ff9682f270236623a19c54262684dc7133..10e99346155a315c0daf053c3bd1375a9e6fdd5b 100755 --- a/src/implementation/core/CMakeLists.txt +++ b/src/implementation/core/CMakeLists.txt @@ -21,8 +21,10 @@ SET( headers cuda-long-vector-kernels.h SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/implementation/core ) IF( BUILD_CUDA ) set( tnl_implementation_core_CUDA__SOURCES + ${tnl_implementation_core_cuda_CUDA__SOURCES} ${CURRENT_DIR}/tnlArray_impl.cu - ${CURRENT_DIR}/tnlVector_impl.cu ) + ${CURRENT_DIR}/tnlVector_impl.cu + PARENT_SCOPE ) ENDIF() set( tnl_implementation_core_SOURCES ${tnl_implementation_core_CUDA__SOURCES} diff --git a/src/implementation/core/cuda-long-vector-kernels.h b/src/implementation/core/cuda-long-vector-kernels.h index 388afec7010a0e44defbd82785d69b5337011b06..c9f49847ff2b7855b3bd5e4b8cb32596c20bc0fe 100644 --- a/src/implementation/core/cuda-long-vector-kernels.h +++ b/src/implementation/core/cuda-long-vector-kernels.h @@ -22,18 +22,12 @@ #include <cuda.h> #endif #include <iostream> -#include <core/tnlVector.h> +#include <core/tnlAssert.h> +#include <core/cuda/reduction-operations.h> +#include <implementation/core/memory-operations.h> using namespace std; -enum tnlTupleOperation { tnlParallelReductionMin = 1, - tnlParallelReductionMax, - tnlParallelReductionSum, - tnlParallelReductionAbsMin, - tnlParallelReductionAbsMax, - tnlParallelReductionAbsSum, - tnlParallelReductionLpNorm, - tnlParallelReductionSdot }; /**** * This constant says that arrays smaller than its value @@ -41,107 +35,24 @@ enum tnlTupleOperation { tnlParallelReductionMin = 1, */ const int maxGPUReductionDataSize = 256; -/**** - * The following kernels and functions have been adopted from - * - * M. Harris, “Optimizing parallel reduction in cuda,” NVIDIA CUDA SDK, 2007. - * - * The code was extended even for data arrays with size different from - * a power of 2. - * - * For the educative and also testing/debugging reasons we have 6 version of this algorithm. - * The slower version can be found as a part o ftesting code. See directory tests. - * Version 1 is the slowest and version 6 is the fastest - teste on CUDA architecture 1.0 - 1.3. - * Another improvements are possible for the future devices. - * - */ - #ifdef HAVE_CUDA -/*** - * This function returns minimum of two numbers stored on the device. - */ -template< class T > __device__ T tnlCudaMin( const T& a, - const T& b ) -{ - return a < b ? a : b; -} - -__device__ int tnlCudaMin( const int& a, - const int& b ) -{ - return min( a, b ); -} - -__device__ float tnlCudaMin( const float& a, - const float& b ) -{ - return fminf( a, b ); -} - -__device__ double tnlCudaMin( const double& a, - const double& b ) -{ - return fmin( a, b ); -} - -/*** - * This function returns maximum of two numbers stored on the device. - */ -template< class T > __device__ T tnlCudaMax( const T& a, - const T& b ) -{ - return a > b ? a : b; -} - -__device__ int tnlCudaMax( const int& a, - const int& b ) -{ - return max( a, b ); -} - -__device__ float tnlCudaMax( const float& a, - const float& b ) -{ - return fmaxf( a, b ); -} - -__device__ double tnlCudaMax( const double& a, - const double& b ) -{ - return fmax( a, b ); -} -/*** - * This function returns absolute value of given number on the device. - */ -__device__ int tnlCudaAbs( const int& a ) -{ - return abs( a ); -} - -__device__ float tnlCudaAbs( const float& a ) -{ - return fabs( a ); -} - -__device__ double tnlCudaAbs( const double& a ) -{ - return fabs( a ); -} /*** * For each thread in block with thread ID smaller then s this function reduces * data elements with indecis tid and tid + s. Here we assume that for each * tid the tid + s element also exists i.e. we have even number of elements. */ -template< class T, tnlTupleOperation operation > -__device__ void reduceAligned( unsigned int tid, - unsigned int s, - T* sdata ) +template< typename Operation > +__device__ void reduceAligned( const Operation& operation, + typename Operation :: IndexType tid, + typename Operation :: IndexType s, + typename Operation :: RealType* sdata ) { if( tid < s ) { - if( operation == tnlParallelReductionMin ) + sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata ); + /*if( operation == tnlParallelReductionMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlParallelReductionMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); @@ -153,7 +64,7 @@ __device__ void reduceAligned( unsigned int tid, sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) ); if( operation == tnlParallelReductionLpNorm || operation == tnlParallelReductionSdot ) - sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ]; + sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ];*/ } } @@ -163,15 +74,17 @@ __device__ void reduceAligned( unsigned int tid, * the previous algorithm. Thid one works even for odd number of elements but * it is a bit slower. */ -template< class T, tnlTupleOperation operation > -__device__ void reduceNonAligned( unsigned int tid, - unsigned int s, - unsigned int n, - T* sdata ) +template< typename Operation > +__device__ void reduceNonAligned( const Operation& operation, + typename Operation :: IndexType tid, + typename Operation :: IndexType s, + typename Operation :: IndexType n, + typename Operation :: RealType* sdata ) { if( tid < s ) { - if( operation == tnlParallelReductionMin ) + sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata ); + /*if( operation == tnlParallelReductionMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlParallelReductionMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); @@ -183,7 +96,7 @@ __device__ void reduceNonAligned( unsigned int tid, sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) ); if( operation == tnlParallelReductionLpNorm || operation == tnlParallelReductionSdot ) - sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ]; + sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ];*/ } /* This is for the case when we have odd number of elements. * The last one will be reduced using the thread with ID 0. @@ -192,7 +105,8 @@ __device__ void reduceNonAligned( unsigned int tid, __syncthreads(); if( 2 * s < n && tid == n - 1 ) { - if( operation == tnlParallelReductionMin ) + sdata[ 0 ] = operation. commonReductionOnDevice( 0, tid, sdata ); + /*if( operation == tnlParallelReductionMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlParallelReductionMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); @@ -204,7 +118,7 @@ __device__ void reduceNonAligned( unsigned int tid, sdata[ 0 ] = tnlCudaMax( tnlCudaAbs( sdata[ 0 ] ), tnlCudaAbs( sdata[ tid + s ] ) ); if( operation == tnlParallelReductionLpNorm || operation == tnlParallelReductionSdot ) - sdata[ 0 ] = sdata[ 0 ] + sdata[ tid + s ]; + sdata[ 0 ] = sdata[ 0 ] + sdata[ tid + s ];*/ } } @@ -224,16 +138,18 @@ __device__ void reduceNonAligned( unsigned int tid, * Each block of the grid writes one element in this array * (i.e. the size of this array equals the number of CUDA blocks). */ -template < typename Type, typename ParameterType, typename Index, tnlTupleOperation operation, int blockSize > -__global__ void tnlCUDAReductionKernel( const Index size, - const Type* deviceInput, - const Type* deviceInput2, - Type* deviceOutput, - const ParameterType parameter, - Type* dbg_array1 = 0 ) +template < typename Operation, int blockSize > +__global__ void tnlCUDAReductionKernel( const Operation operation, + const typename Operation :: IndexType size, + const typename Operation :: RealType* deviceInput, + const typename Operation :: RealType* deviceInput2, + typename Operation :: RealType* deviceOutput ) { extern __shared__ __align__ ( 8 ) char __sdata[]; - Type* sdata = reinterpret_cast< Type* >( __sdata ); + + typedef typename Operation :: IndexType IndexType; + typedef typename Operation :: RealType RealType; + RealType* sdata = reinterpret_cast< RealType* >( __sdata ); /*** * Get thread id (tid) and global thread id (gid). @@ -241,10 +157,10 @@ __global__ void tnlCUDAReductionKernel( const Index size, * gridSize is the number of element processed by all blocks at the * same time. */ - unsigned int tid = threadIdx. x; - unsigned int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x; - unsigned int lastTId = size - 2 * blockIdx. x * blockDim. x; - unsigned int gridSize = 2 * blockDim. x * gridDim.x; + IndexType tid = threadIdx. x; + IndexType gid = 2 * blockIdx. x * blockDim. x + threadIdx. x; + IndexType lastTId = size - 2 * blockIdx. x * blockDim. x; + IndexType gridSize = 2 * blockDim. x * gridDim.x; /*** * Read data into the shared memory. We start with the @@ -252,7 +168,8 @@ __global__ void tnlCUDAReductionKernel( const Index size, */ if( gid + blockDim. x < size ) { - if( operation == tnlParallelReductionMin ) + sdata[ tid ] = operation. initialValueOnDevice( gid, gid + blockDim. x, deviceInput, deviceInput2 ); + /*if( operation == tnlParallelReductionMin ) sdata[ tid ] = tnlCudaMin( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ); if( operation == tnlParallelReductionMax ) sdata[ tid ] = tnlCudaMax( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ); @@ -267,22 +184,24 @@ __global__ void tnlCUDAReductionKernel( const Index size, powf( tnlCudaAbs( deviceInput[ gid + blockDim. x ] ), parameter ); if( operation == tnlParallelReductionSdot ) sdata[ tid ] = deviceInput[ gid ] * deviceInput2[ gid ] + - deviceInput[ gid + blockDim. x ] * deviceInput2[ gid + blockDim. x ]; + deviceInput[ gid + blockDim. x ] * deviceInput2[ gid + blockDim. x ];*/ } else if( gid < size ) { - if( operation == tnlParallelReductionLpNorm ) + sdata[ tid ] = operation. initialValueOnDevice( gid, deviceInput, deviceInput2 ); + /*if( operation == tnlParallelReductionLpNorm ) sdata[ tid ] = powf( tnlCudaAbs( deviceInput[ gid ] ), parameter ); else if( operation == tnlParallelReductionSdot ) sdata[ tid ] = deviceInput[ gid ] * deviceInput2[ gid ]; else - sdata[ tid ] = deviceInput[ gid ]; + sdata[ tid ] = deviceInput[ gid ];*/ } gid += gridSize; while( gid + blockDim. x < size ) { - if( operation == tnlParallelReductionMin ) + sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, gid + blockDim. x, sdata, deviceInput, deviceInput2 ); + /*if( operation == tnlParallelReductionMin ) sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], :: tnlCudaMin( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ) ); if( operation == tnlParallelReductionMax ) sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ) ); @@ -297,12 +216,13 @@ __global__ void tnlCUDAReductionKernel( const Index size, powf( tnlCudaAbs( deviceInput[ gid + blockDim. x ] ), parameter ); if( operation == tnlParallelReductionSdot ) sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ] + - deviceInput[ gid + blockDim. x] * deviceInput2[ gid + blockDim. x ]; + deviceInput[ gid + blockDim. x] * deviceInput2[ gid + blockDim. x ];*/ gid += gridSize; } if( gid < size ) { - if( operation == tnlParallelReductionMin ) + sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, sdata, deviceInput, deviceInput2 ); + /*if( operation == tnlParallelReductionMin ) sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], deviceInput[ gid ] ); if( operation == tnlParallelReductionMax ) sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], deviceInput[ gid ] ); @@ -315,7 +235,7 @@ __global__ void tnlCUDAReductionKernel( const Index size, if( operation == tnlParallelReductionLpNorm ) sdata[ tid ] += powf( tnlCudaAbs( deviceInput[gid] ), parameter ); if( operation == tnlParallelReductionSdot ) - sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ]; + sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ];*/ } __syncthreads(); @@ -337,19 +257,19 @@ __global__ void tnlCUDAReductionKernel( const Index size, if( blockSize >= 512 ) { if( tid < 256 ) - reduceAligned< Type, operation >( tid, 256, sdata ); + reduceAligned( operation, tid, 256, sdata ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - reduceAligned< Type, operation >( tid, 128, sdata ); + reduceAligned( operation, tid, 128, sdata ); __syncthreads(); } if( blockSize >= 128 ) { if( tid < 64 ) - reduceAligned< Type, operation >( tid, 64, sdata ); + reduceAligned( operation, tid, 64, sdata ); __syncthreads(); } @@ -359,17 +279,17 @@ __global__ void tnlCUDAReductionKernel( const Index size, if (tid < 32) { if( blockSize >= 64 ) - reduceAligned< Type, operation >( tid, 32, sdata ); + reduceAligned( operation, tid, 32, sdata ); if( blockSize >= 32 ) - reduceAligned< Type, operation >( tid, 16, sdata ); + reduceAligned( operation, tid, 16, sdata ); if( blockSize >= 16 ) - reduceAligned< Type, operation >( tid, 8, sdata ); + reduceAligned( operation, tid, 8, sdata ); if( blockSize >= 8 ) - reduceAligned< Type, operation >( tid, 4, sdata ); + reduceAligned( operation, tid, 4, sdata ); if( blockSize >= 4 ) - reduceAligned< Type, operation >( tid, 2, sdata ); + reduceAligned( operation, tid, 2, sdata ); if( blockSize >= 2 ) - reduceAligned< Type, operation >( tid, 1, sdata ); + reduceAligned( operation, tid, 1, sdata ); } } else @@ -378,35 +298,35 @@ __global__ void tnlCUDAReductionKernel( const Index size, if( n >= 512 ) { s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); + reduceNonAligned( operation, tid, s, n, sdata ); n = s; __syncthreads(); } if( n >= 256 ) { s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); + reduceNonAligned( operation, tid, s, n, sdata ); n = s; __syncthreads(); } if( n >= 128 ) { s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); + reduceNonAligned( operation, tid, s, n, sdata ); n = s; __syncthreads(); } if( n >= 64 ) { s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); + reduceNonAligned( operation, tid, s, n, sdata ); n = s; __syncthreads(); } if( n >= 32 ) { s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); + reduceNonAligned( operation, tid, s, n, sdata ); n = s; __syncthreads(); } @@ -416,25 +336,25 @@ __global__ void tnlCUDAReductionKernel( const Index size, if( n >= 16 ) { s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); + reduceNonAligned( operation, tid, s, n, sdata ); n = s; } if( n >= 8 ) { s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); + reduceNonAligned( operation, tid, s, n, sdata ); n = s; } if( n >= 4 ) { s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); + reduceNonAligned( operation, tid, s, n, sdata ); n = s; } if( n >= 2 ) { s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); + reduceNonAligned( operation, tid, s, n, sdata ); n = s; } @@ -468,15 +388,18 @@ __global__ void tnlCUDAReductionKernel( const Index size, * allocation of this array on the device inside of this function. * The size of this array should be size / 128 * sizeof( T ). */ -template< typename Type, typename ParameterType, typename Index, tnlTupleOperation operation > -bool tnlCUDALongVectorReduction( const Index size, - const Type* deviceInput1, - const Type* deviceInput2, - Type& result, - const ParameterType& parameter, - Type* deviceAux = 0 ) +template< typename Operation > +bool tnlCUDALongVectorReduction( const Operation& operation, + const typename Operation :: IndexType size, + const typename Operation :: RealType* deviceInput1, + const typename Operation :: RealType* deviceInput2, + typename Operation :: RealType& result, + typename Operation :: RealType* deviceAux = 0 ) { #ifdef HAVE_CUDA + + typedef typename Operation :: IndexType IndexType; + typedef typename Operation :: RealType RealType; /*** * Set parameters: * @param desBlockSize is desired block size with which we get the best performance (on CUDA rach 1.0 to 1.3) @@ -485,7 +408,7 @@ bool tnlCUDALongVectorReduction( const Index size, const int desBlockSize = 512; const int desGridSize = 2048; - Type* dbg_array1; // debuging array + RealType* dbg_array1; // debuging array /*** * Allocating auxiliary device memory to store temporary reduced arrays. @@ -494,13 +417,15 @@ bool tnlCUDALongVectorReduction( const Index size, * If one calls the CUDA reduction more then once then one can provide * auxiliary array by passing it via the parameter deviceAux. */ - tnlVector< Type, tnlCuda > deviceAuxVct( "tnlCUDAOneVectorReduction:deviceAuxVct" ); + RealType* myDeviceAux( 0 ); if( ! deviceAux ) { - int sizeAlloc = :: Max( 1, size / desBlockSize ); - if( ! deviceAuxVct. setSize( sizeAlloc ) ) + if( ! allocateMemoryCuda( myDeviceAux, :: Max( 1, size / desBlockSize ) ) ) + { + checkCudaDevice; return false; - deviceAux = deviceAuxVct. getData(); + } + deviceAux = myDeviceAux; } /*** @@ -509,26 +434,27 @@ bool tnlCUDALongVectorReduction( const Index size, * @param reductionInput tells what data we shell reduce. We start with the input if this fuction * and after the 1st reduction step we switch this pointer to deviceAux. */ - int sizeReduced = size; - const Type* reductionInput1 = deviceInput1; - const Type* reductionInput2 = deviceInput2; - int reductionSteps( 0 ); + IndexType sizeReduced = size; + const RealType* reductionInput1 = deviceInput1; + const RealType* reductionInput2 = deviceInput2; + IndexType reductionSteps( 0 ); while( sizeReduced > maxGPUReductionDataSize ) { dim3 blockSize( 0 ), gridSize( 0 ); blockSize. x = :: Min( sizeReduced, desBlockSize ); - gridSize. x = :: Min( ( int ) ( sizeReduced / blockSize. x + 1 ) / 2, desGridSize ); + gridSize. x = :: Min( ( IndexType ) ( sizeReduced / blockSize. x + 1 ) / 2, desGridSize ); /*** * We align the blockSize to the power of 2. */ - Index alignedBlockSize = 1; + IndexType alignedBlockSize = 1; while( alignedBlockSize < blockSize. x ) alignedBlockSize <<= 1; blockSize. x = alignedBlockSize; - Index shmem = blockSize. x * sizeof( Type ); + IndexType shmem = blockSize. x * sizeof( RealType ); /*** * Depending on the blockSize we generate appropriate template instance. */ +#ifdef UNDEF if( reductionSteps > 0 && ( operation == tnlParallelReductionSdot || operation == tnlParallelReductionLpNorm ) ) @@ -584,43 +510,44 @@ bool tnlCUDALongVectorReduction( const Index size, } } else +#endif switch( blockSize. x ) { case 512: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 512 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); + tnlCUDAReductionKernel< Operation, 512 > + <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux ); break; case 256: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 256 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); + tnlCUDAReductionKernel< Operation, 256 > + <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux ); break; case 128: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 128 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); + tnlCUDAReductionKernel< Operation, 128 > + <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux ); break; case 64: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 64 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); + tnlCUDAReductionKernel< Operation, 64 > + <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux ); break; case 32: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 32 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); + tnlCUDAReductionKernel< Operation, 32 > + <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux ); break; case 16: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 16 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); + tnlCUDAReductionKernel< Operation, 16 > + <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux ); break; case 8: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 8 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); + tnlCUDAReductionKernel< Operation, 8 > + <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux ); break; case 4: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 4 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); + tnlCUDAReductionKernel< Operation, 4 > + <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux ); break; case 2: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 2 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); + tnlCUDAReductionKernel< Operation, 2 > + <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux ); break; case 1: tnlAssert( false, cerr << "blockSize should not be 1." << endl ); @@ -639,15 +566,13 @@ bool tnlCUDALongVectorReduction( const Index size, * If sizeReduced equals size the previous loop was not processed and we read * data directly from the input. */ - Type result_array[ maxGPUReductionDataSize ]; - Type result_array2[ maxGPUReductionDataSize ]; + RealType result_array[ maxGPUReductionDataSize ]; + RealType result_array2[ maxGPUReductionDataSize ]; if( sizeReduced == size ) { - if( cudaMemcpy( result_array, deviceInput1, sizeReduced * sizeof( Type ), cudaMemcpyDeviceToHost ) != cudaSuccess ) - { - CHECK_CUDA_ERROR; + if( ! copyMemoryCudaToHost( result_array, deviceInput1, sizeReduced ) ) return false; - } +#ifdef UNDEF switch( operation ) { case tnlParallelReductionLpNorm: @@ -657,10 +582,8 @@ bool tnlCUDALongVectorReduction( const Index size, result = pow( result, 1.0/ parameter ); return true; case tnlParallelReductionSdot: - if( cudaMemcpy( result_array2, deviceInput2, sizeReduced * sizeof( Type ), cudaMemcpyDeviceToHost ) != cudaSuccess ) - { - CHECK_CUDA_ERROR; - } + if( ! copyMemoryCudaToHost( result_array2, deviceInput2, sizeReduced ) ) + return false; else { result = 0; @@ -669,14 +592,15 @@ bool tnlCUDALongVectorReduction( const Index size, return true; } } +#endif } else - if( cudaMemcpy( result_array, deviceAux, sizeReduced * sizeof( Type ), cudaMemcpyDeviceToHost ) != cudaSuccess ) - { - CHECK_CUDA_ERROR; + if( ! copyMemoryCudaToHost( result_array, deviceAux, sizeReduced ) ) return false; - } - switch( operation ) + result = result_array[ 0 ]; + for( IndexType i = 1; i < sizeReduced; i ++ ) + result = operation. reduceOnHost( result, result_array[ i ] ); + /*switch( operation ) { case tnlParallelReductionMax: result = result_array[ 0 ]; @@ -705,6 +629,12 @@ bool tnlCUDALongVectorReduction( const Index size, for( Index i = 1; i < sizeReduced; i ++ ) result = Min( result, tnlAbs( result_array[ i ] ) ); break; + }*/ + if( myDeviceAux ) + { + freeMemoryCuda( myDeviceAux ); + if( ! checkCudaDevice ) + return false; } return true; #else @@ -759,6 +689,7 @@ __global__ void compareTwoVectorsElementwise( const Index size, * Result is written into a bool array. The minimum value then says if both vectors equal. * */ +// TODO: result of comparison should not be returned!!! template< typename Type, typename Index > bool tnlCUDALongVectorComparison( const Index size, const Type* deviceInput1, @@ -770,12 +701,14 @@ bool tnlCUDALongVectorComparison( const Index size, tnlAssert( size > 0, cerr << "You try to compare two CUDA long vectors with non-positive size." << endl << "The size is " << size ); - tnlVector< bool, tnlCuda, Index > boolArray( "tnlCUDALongVectorComparison:bool_array" ); + //tnlVector< bool, tnlCuda, Index > boolArray( "tnlCUDALongVectorComparison:bool_array" ); + bool* myDeviceBoolAux( 0 ); if( ! deviceBoolAux ) { - if( ! boolArray. setSize( size ) ) + //if( ! boolArray. setSize( size ) ) + if( ! allocateMemoryCuda( myDeviceBoolAux, size ) ) return false; - deviceBoolAux = boolArray. getData(); + deviceBoolAux = myDeviceBoolAux; } dim3 blockSize( 0 ), gridSize( 0 ); blockSize. x = 256; @@ -785,7 +718,8 @@ bool tnlCUDALongVectorComparison( const Index size, deviceInput1, deviceInput2, deviceBoolAux ); - CHECK_CUDA_ERROR; + if( ! checkCudaDevice ) + return false; bool result; if( ! tnlCUDALongVectorReduction< bool, bool, Index, tnlParallelReductionMin >( size, deviceBoolAux, diff --git a/src/implementation/core/cuda/CMakeLists.txt b/src/implementation/core/cuda/CMakeLists.txt index 266346483cd29086b07e2d774dd0a41755095092..5e9be45d30fbf42d70a12cf53528d2dd800be122 100755 --- a/src/implementation/core/cuda/CMakeLists.txt +++ b/src/implementation/core/cuda/CMakeLists.txt @@ -1,7 +1,14 @@ -SET( headers reduction_impl.h ) +SET( headers reduction_impl.h + reduction-operations_impl.h ) SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/implementation/core/cuda ) +IF( BUILD_CUDA ) + set( tnl_implementation_core_cuda_CUDA__SOURCES + ${CURRENT_DIR}/reduction-operations_impl.cu + PARENT_SCOPE ) +endif() + set( tnl_implementation_core_cuda_SOURCES ${CURRENT_DIR}/device-check.cpp PARENT_SCOPE ) diff --git a/src/implementation/core/cuda/reduction-operations_impl.cu b/src/implementation/core/cuda/reduction-operations_impl.cu new file mode 100644 index 0000000000000000000000000000000000000000..b86aa68fcb6b246962d6c8a2c6a24f2fe861b0ec --- /dev/null +++ b/src/implementation/core/cuda/reduction-operations_impl.cu @@ -0,0 +1,16 @@ +/*************************************************************************** + reduction-operations.cu - description + ------------------- + begin : Mar 22, 2013 + copyright : (C) 2013 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ \ No newline at end of file diff --git a/src/implementation/core/cuda/reduction-operations_impl.h b/src/implementation/core/cuda/reduction-operations_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..113b837a5909e9f834ed08a8dbcd6d9c8027ab93 --- /dev/null +++ b/src/implementation/core/cuda/reduction-operations_impl.h @@ -0,0 +1,23 @@ +/*************************************************************************** + reduction-operations_impl.h - description + ------------------- + begin : Mar 22, 2013 + copyright : (C) 2013 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + +#ifndef REDUCTION_OPERATIONS_IMPL_H_ +#define REDUCTION_OPERATIONS_IMPL_H_ + + + +#endif /* REDUCTION_OPERATIONS_IMPL_H_ */ diff --git a/src/implementation/core/memory-operations.h b/src/implementation/core/memory-operations.h index dae87f9fa932883f76ff1d9191befc580c2e6cb7..6a645b7560310df4039764860ec946b7ac641faf 100644 --- a/src/implementation/core/memory-operations.h +++ b/src/implementation/core/memory-operations.h @@ -25,20 +25,23 @@ const int tnlGPUvsCPUTransferBufferSize( 1 << 20 ); template< typename Element, typename Index > -void allocateMemoryHost( Element*& data, +bool allocateMemoryHost( Element*& data, const Index size ) { - data = new Element[ size ]; + if( ! ( data = new Element[ size ] ) ) + return false; + return true; } template< typename Element, typename Index > -void allocateMemoryCuda( Element*& data, +bool allocateMemoryCuda( Element*& data, const Index size ) { #ifdef HAVE_CUDA if( cudaMalloc( ( void** ) &data, ( size_t ) size * sizeof( Element ) ) != cudaSuccess ) data = 0; + return checkCudaDevice; #endif } @@ -54,7 +57,7 @@ bool freeMemoryCuda( Element* data ) { #ifdef HAVE_CUDA cudaFree( data ); - checkCudaDevice; + return checkCudaDevice; #endif return true; } @@ -95,11 +98,11 @@ bool setMemoryCuda( Element* data, { #ifdef HAVE_CUDA dim3 blockSize, gridSize; - blockSize. x = 32; - int blocksNumber = ceil( ( double ) size / ( double ) blockSize. x ); - int elementsPerThread = ceil( ( double ) blocksNumber / ( double ) maxCudaGridSize ); + blockSize. x = 256; + Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x ); + Index elementsPerThread = ceil( ( double ) blocksNumber / ( double ) maxCudaGridSize ); gridSize. x = Min( blocksNumber, maxCudaGridSize ); - cout << "blocksNumber = " << blocksNumber << "Grid size = " << gridSize. x << " elementsPerThread = " << elementsPerThread << endl; + //cout << "blocksNumber = " << blocksNumber << "Grid size = " << gridSize. x << " elementsPerThread = " << elementsPerThread << endl; setVectorValueCudaKernel<<< blockSize, gridSize >>>( data, size, value, elementsPerThread ); return checkCudaDevice; @@ -148,15 +151,11 @@ bool copyMemoryCudaToHost( Element* destination, const Index size ) { #ifdef HAVE_CUDA - if( cudaMemcpy( destination, - source, - size * sizeof( Element ), - cudaMemcpyDeviceToHost ) != cudaSuccess ) - { - cerr << "Transfer of data from CUDA device to host failed." << endl; - return false; - } - return true; + cudaMemcpy( destination, + source, + size * sizeof( Element ), + cudaMemcpyDeviceToHost ); + return checkCudaDevice; #else cerr << "CUDA support is missing in this system." << endl; return false; @@ -173,11 +172,7 @@ bool copyMemoryCudaToCuda( Element* destination, source, size * sizeof( Element ), cudaMemcpyDeviceToDevice ) != cudaSuccess ) - { - cerr << "Transfer of data from CUDA device to device failed." << endl; - return false; - } - return true; + return checkCudaDevice; #else cerr << "CUDA support is missing in this system." << endl; return false; diff --git a/tests/unit-tests/CMakeLists.txt b/tests/unit-tests/CMakeLists.txt index eeef543f8dbfc95a8325992c9f1c574d14c5cee3..367fd50bd59e964e8f63c564274fc9bafffdf4ef 100755 --- a/tests/unit-tests/CMakeLists.txt +++ b/tests/unit-tests/CMakeLists.txt @@ -36,6 +36,8 @@ if( BUILD_CUDA ) ADD_TEST( core/cuda/tnl-device-check-test${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnl-device-check-test${mpiExt}${debugExt} ) ADD_TEST( core/cuda/tnl-memory-operations-test${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnl-memory-operations-test${mpiExt}${debugExt} ) SET_TESTS_PROPERTIES ( core/cuda/tnl-memory-operations-test${mpiExt}${debugExt} PROPERTIES DEPENDS core/cuda/tnl-device-check-test${mpiExt}${debugExt} ) + ADD_TEST( core/cuda/tnl-reduction-test${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnl-reduction-test${mpiExt}${debugExt} ) + SET_TESTS_PROPERTIES ( core/cuda/tnl-reduction-test${mpiExt}${debugExt} PROPERTIES DEPENDS core/cuda/tnl-memory-operations-test${mpiExt}${debugExt} ) endif() ADD_TEST( tnl-unit-tests${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnl-unit-tests${mpiExt}${debugExt} ) diff --git a/tests/unit-tests/core/cuda/CMakeLists.txt b/tests/unit-tests/core/cuda/CMakeLists.txt index 73f852b9c516d2e23fd8b1c053118f630d459e7b..c52a01d7c043cab8cb01e64ff0f3052daef7203a 100755 --- a/tests/unit-tests/core/cuda/CMakeLists.txt +++ b/tests/unit-tests/core/cuda/CMakeLists.txt @@ -1,5 +1,6 @@ set( headers tnlCudaDeviceCheckTester.h - tnlCudaMemoryOperationsTester.h ) + tnlCudaMemoryOperationsTester.h + tnlCudaReductionTester.h ) if( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-device-check-test${mpiExt}${debugExt} ${headers} device-check-test.cu ) @@ -10,11 +11,10 @@ if( BUILD_CUDA ) TARGET_LINK_LIBRARIES( tnl-memory-operations-test${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES} tnl${mpiExt}${debugExt}-0.1 ) -# CUDA_ADD_EXECUTABLE( tnl-reduction-test${mpiExt}${debugExt} ${headers} reduction-test.cu ) -# TARGET_LINK_LIBRARIES( tnl-reduction-test${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES} -# tnl${mpiExt}${debugExt}-0.1 ) - - + CUDA_ADD_EXECUTABLE( tnl-reduction-test${mpiExt}${debugExt} ${headers} reduction-test.cu ) + TARGET_LINK_LIBRARIES( tnl-reduction-test${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES} + tnl${mpiExt}${debugExt}-0.1 ) + endif() diff --git a/tests/unit-tests/core/cuda/reduction-test.cu b/tests/unit-tests/core/cuda/reduction-test.cu index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e980cf5b9cb70bf44319b159716245be79fa974f 100644 --- a/tests/unit-tests/core/cuda/reduction-test.cu +++ b/tests/unit-tests/core/cuda/reduction-test.cu @@ -0,0 +1,34 @@ +/*************************************************************************** + reduction-test.cu - description + ------------------- + begin : Mar 20, 2013 + copyright : (C) 2013 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + +#include <tnlConfig.h> + +#ifdef HAVE_CPPUNIT + +#include <cppunit/ui/text/TestRunner.h> +#include "tnlCudaReductionTester.h" + +int main( int argc, char* argv[] ) +{ + CppUnit :: TextTestRunner runner; + runner. addTest( tnlCudaReductionTester :: suite() ); + if( ! runner.run() ) + return EXIT_FAILURE; + return EXIT_SUCCESS; +} + +#endif diff --git a/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h b/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h index b65a8091a588a1367e3955d60bd5c0814e597888..21183a9d2ea941c5c39a424a7d2027397a66e070 100644 --- a/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h +++ b/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h @@ -51,10 +51,10 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase "smallMemorySetTest", &tnlCudaMemoryOperationsTester :: smallMemorySetTest ) ); - suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >( + /*suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >( "bigMemorySetTest", &tnlCudaMemoryOperationsTester :: bigMemorySetTest ) - ); + );*/ return suiteOfTests; }; @@ -71,7 +71,7 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase void copyTest() { - const int size( 1 << 20 ); + const int size( 1 << 22 ); int *hostData1, *hostData2, *deviceData; allocateMemoryHost( hostData1, size ); allocateMemoryHost( hostData2, size ); diff --git a/tests/unit-tests/core/cuda/tnlCudaReductionTester.h b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h new file mode 100644 index 0000000000000000000000000000000000000000..8a0519885c4b7e1bc7b94e21b0f71b5cc573de5e --- /dev/null +++ b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h @@ -0,0 +1,122 @@ +/*************************************************************************** + tnlCudaReductionTester.h - description + ------------------- + begin : Mar 22, 2013 + copyright : (C) 2013 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + + +#ifndef TNLCUDAREDUCTIONTESTER_H_ +#define TNLCUDAREDUCTIONTESTER_H_ + +#include <cppunit/TestSuite.h> +#include <cppunit/TestResult.h> +#include <cppunit/TestCaller.h> +#include <cppunit/TestCase.h> +#include <cppunit/Message.h> +#include <core/cuda/device-check.h> +#include <implementation/core/cuda-long-vector-kernels.h> + +class tnlCudaReductionTester : public CppUnit :: TestCase +{ + public: + tnlCudaReductionTester(){}; + + virtual + ~tnlCudaReductionTester(){}; + + static CppUnit :: Test* suite() + { + CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCudaReductionTester" ); + CppUnit :: TestResult result; + + suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >( + "constantSequenceTest", + &tnlCudaReductionTester :: constantSequenceTest< float > ) + ); + + suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >( + "linearSequenceTest", + &tnlCudaReductionTester :: linearSequenceTest< float > ) + ); + + + return suiteOfTests; + } + + template< typename RealType > + void constantSequenceTest() + { + const int size( 1024 ); + RealType *hostData, *deviceData; + allocateMemoryHost( hostData, size ); + allocateMemoryCuda( deviceData, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + for( int i = 0; i < size; i ++ ) + hostData[ i ] = 1; + copyMemoryHostToCuda( deviceData, hostData, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + RealType result; + tnlParallelReductionSum< RealType, int > sumOperation; + CPPUNIT_ASSERT( + ( tnlCUDALongVectorReduction( sumOperation, size, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == size ); + + tnlParallelReductionMin< RealType, int > minOperation; + CPPUNIT_ASSERT( + ( tnlCUDALongVectorReduction( minOperation, size, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 1 ); + + + freeMemoryHost( hostData ); + freeMemoryCuda( deviceData ); + CPPUNIT_ASSERT( checkCudaDevice ); + }; + + template< typename RealType > + void linearSequenceTest() + { + const int size( 1024 ); + RealType *hostData, *deviceData; + allocateMemoryHost( hostData, size ); + allocateMemoryCuda( deviceData, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + RealType sum( 0.0 ); + for( int i = 0; i < size; i ++ ) + { + hostData[ i ] = i + 1; + sum += hostData[ i ]; + } + copyMemoryHostToCuda( deviceData, hostData, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + tnlParallelReductionSum< RealType, int > sumOperation; + RealType result; + CPPUNIT_ASSERT( + ( tnlCUDALongVectorReduction( sumOperation, size, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == sum ); + tnlParallelReductionMin< RealType, int > minOperation; + CPPUNIT_ASSERT( + ( tnlCUDALongVectorReduction( minOperation, size, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 1 ); + + + freeMemoryHost( hostData ); + freeMemoryCuda( deviceData ); + CPPUNIT_ASSERT( checkCudaDevice ); + }; + +}; + + +#endif /* TNLCUDAREDUCTIONTESTER_H_ */