diff --git a/src/core/cuda/cuda-prefix-sum_impl.h b/src/core/cuda/cuda-prefix-sum_impl.h
index 75d86ece9e0742f4de062b08deacd85e809cb3c6..ad2d02aa3341c21056b9e2994df93918aa38db52 100644
--- a/src/core/cuda/cuda-prefix-sum_impl.h
+++ b/src/core/cuda/cuda-prefix-sum_impl.h
@@ -86,8 +86,8 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    while( chunkPointer < chunkSize &&
           chunkOffset + chunkPointer < lastElementInBlock )
    {
-      operation.performInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
-                                sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
+      operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
+                                         sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
       auxData[ threadIdx. x ] =
          sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ];
       chunkPointer ++;
@@ -100,7 +100,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    const int warpIdx = threadIdx. x / tnlCuda::getWarpSize();
    for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
       if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks )
-         operation.performInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );
+         operation.commonReductionOnDevice( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );
 
    if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 )
       warpSums[ warpIdx ] = auxData[ threadIdx. x ];
@@ -112,14 +112,14 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    if( warpIdx == 0 )
      for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
         if( threadInWarpIdx >= stride )
-           operation.performInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
+           operation.commonReductionOnDevice( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
    __syncthreads();
 
    /****
    * Shift the warp prefix-sums.
    */
    if( warpIdx > 0 )
-      operation.performInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );
+      operation.commonReductionOnDevice( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );
 
    /***
    * Store the result back in global memory.
@@ -132,7 +132,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
       DataType chunkShift( operation.initialValue() );
      if( chunkIdx > 0 )
         chunkShift = auxData[ chunkIdx - 1 ];
-      operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
+      operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
       output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ];
       idx += blockDim. x;
    }
@@ -141,10 +141,15 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    if( threadIdx. x == 0 )
    {
       if( prefixSumType == exclusivePrefixSum )
-         auxArray[ blockIdx. x ] =
-            operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ),
-                                               tnlCuda::getInterleaving( lastElementInBlock ),
-                                               sharedData );
+      {
+         /*auxArray[ blockIdx. x ] = operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ),
+                                                                        tnlCuda::getInterleaving( lastElementInBlock ),
+                                                                        sharedData );*/
+         DataType aux = operation.initialValue();
+         operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ] );
+         operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] );
+         auxArray[ blockIdx. x ] = aux;
+      }
       else
          auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ];
    }
diff --git a/src/core/cuda/cuda-reduction_impl.h b/src/core/cuda/cuda-reduction_impl.h
index 5cc93fd8add91780acedc5eb079fa33733ffe226..434131239ca8507ea0deca3dca6392aca9225b73 100644
--- a/src/core/cuda/cuda-reduction_impl.h
+++ b/src/core/cuda/cuda-reduction_impl.h
@@ -37,27 +37,25 @@ using namespace std;
 
-
 /****
  * Arrays smaller than the following constant
  * are reduced on CPU. The constant must not be larger
  * than maximal CUDA grid size.
  */
-const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256;
+const int minGPUReductionDataSize = 128;//65536; //16384;//1024;//256;
 
 static tnlCudaReductionBuffer cudaReductionBuffer( 8 * minGPUReductionDataSize );
 
 #ifdef HAVE_CUDA
-
-template< typename Operation, int blockSize, bool isSizePow2 >
+template< typename Operation, int blockSize >
 __global__ void tnlCUDAReductionKernel( const Operation operation,
                                         const typename Operation :: IndexType size,
                                         const typename Operation :: RealType* input1,
                                         const typename Operation :: RealType* input2,
                                         typename Operation :: ResultType* output )
 {
-   typedef tnlCUDAReduction< Operation, blockSize, isSizePow2 > Reduction;
+   typedef tnlCUDAReduction< Operation, blockSize > Reduction;
    Reduction::reduce( operation, size, input1, input2, output );
 };
@@ -73,124 +71,61 @@ typename Operation::IndexType reduceOnCudaDevice( const Operation& operation,
    typedef typename Operation :: ResultType ResultType;
 
    const IndexType desGridSize( minGPUReductionDataSize );
-   dim3 blockSize( 256 ), gridSize( 0 );
-
+   dim3 blockSize( 256 ), gridSize( 0 );
    gridSize. x = Min( tnlCuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
-
-   /*#ifdef CUDA_REDUCTION_PROFILING
-      tnlTimerRT timer;
-      timer.reset();
-      timer.start();
-   #endif */
 
    if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) )
       return false;
-   output = cudaReductionBuffer.template getData< ResultType >();
-
-
+   output = cudaReductionBuffer.template getData< ResultType >();
    IndexType shmem = blockSize.x * sizeof( ResultType );
+
    /***
    * Depending on the blockSize we generate appropriate template instance.
    */
-
-   if( isPow2( size ) )
-   {
-      switch( blockSize.x )
-      {
-         case 512:
-            tnlCUDAReductionKernel< Operation, 512, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 256:
-            tnlCUDAReductionKernel< Operation, 256, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 128:
-            tnlCUDAReductionKernel< Operation, 128, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 64:
-            tnlCUDAReductionKernel< Operation, 64, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 32:
-            tnlCUDAReductionKernel< Operation, 32, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 16:
-            tnlCUDAReductionKernel< Operation, 16, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 8:
-            tnlCUDAReductionKernel< Operation, 8, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 4:
-            tnlCUDAReductionKernel< Operation, 4, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 2:
-            tnlCUDAReductionKernel< Operation, 2, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 1:
-            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-         default:
-            tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-      }
-   }
-   else
+   switch( blockSize.x )
    {
-      switch( blockSize.x )
-      {
-         case 512:
-            tnlCUDAReductionKernel< Operation, 512, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 256:
-            tnlCUDAReductionKernel< Operation, 256, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 128:
-            tnlCUDAReductionKernel< Operation, 128, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 64:
-            tnlCUDAReductionKernel< Operation, 64, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 32:
-            tnlCUDAReductionKernel< Operation, 32, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 16:
-            tnlCUDAReductionKernel< Operation, 16, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 8:
-            tnlCUDAReductionKernel< Operation, 8, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 4:
-            tnlCUDAReductionKernel< Operation, 4, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 2:
-            tnlCUDAReductionKernel< Operation, 2, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 1:
-            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-         default:
-            tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-      }
+      case 512:
+         tnlCUDAReductionKernel< Operation, 512 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 256:
+         tnlCUDAReductionKernel< Operation, 256 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 128:
+         tnlCUDAReductionKernel< Operation, 128 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 64:
+         tnlCUDAReductionKernel< Operation, 64 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 32:
+         tnlCUDAReductionKernel< Operation, 32 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 16:
+         tnlCUDAReductionKernel< Operation, 16 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 8:
+         tnlCUDAReductionKernel< Operation, 8 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 4:
+         tnlCUDAReductionKernel< Operation, 4 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 2:
+         tnlCUDAReductionKernel< Operation, 2 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 1:
+         tnlAssert( false, cerr << "blockSize should not be 1." << endl );
+      default:
+         tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
    }
    //checkCudaDevice;
-   /*#ifdef CUDA_REDUCTION_PROFILING
-      //cudaThreadSynchronize();
-      timer.stop();
-      cout << " Main reduction on GPU took " << timer.getTime() << " sec. " << endl;
-   #endif */
    return gridSize. x;
 }
 #endif
@@ -222,62 +157,64 @@ bool reductionOnCudaDevice( const Operation& operation,
       if( deviceInput2 && !
          tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, size ) )
         return false;
-      result = operation. initialValueOnHost( 0, hostArray1, hostArray2 );
-      for( IndexType i = 1; i < size; i ++ )
-         result = operation. reduceOnHost( i, result, hostArray1, hostArray2 );
+      result = operation.initialValue();
+      for( IndexType i = 0; i < size; i ++ )
+         result = operation.reduceOnHost( i, result, hostArray1, hostArray2 );
       return true;
    }
 
+   #ifdef CUDA_REDUCTION_PROFILING
+      tnlTimerRT timer;
+      timer.reset();
+      timer.start();
+   #endif
+
    /****
    * Reduce the data on the CUDA device.
-   */
-#ifdef CUDA_REDUCTION_PROFILING
-   tnlTimerRT timer;
-   timer.reset();
-   timer.start();
-#endif
+   */
   ResultType* deviceAux1( 0 );
   IndexType reducedSize = reduceOnCudaDevice( operation,
                                               size,
                                               deviceInput1,
                                               deviceInput2,
                                               deviceAux1 );
-#ifdef CUDA_REDUCTION_PROFILING
-   timer.stop();
-   cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getTime() << " sec. " << endl;
-#endif
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.stop();
+      cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getTime() << " sec. " << endl;
+      timer.reset();
+      timer.start();
+   #endif
 
   /***
   * Transfer the reduced data from device to host.
   */
-#ifdef CUDA_REDUCTION_PROFILING
-   timer.reset();
-   timer.start();
-#endif
   ResultType resultArray[ minGPUReductionDataSize ];
   if( ! tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) )
      return false;
-#ifdef CUDA_REDUCTION_PROFILING
-   timer.stop();
-   cout << " Transferring data to CPU took " << timer.getTime() << " sec. " << endl;
-#endif
+
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.stop();
+      cout << " Transferring data to CPU took " << timer.getTime() << " sec. " << endl;
+   #endif
 
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.reset();
+      timer.start();
+   #endif
+
   /***
   * Reduce the data on the host system.
-  */
+  */
   LaterReductionOperation laterReductionOperation;
-#ifdef CUDA_REDUCTION_PROFILING
-   timer.reset();
-   timer.start();
-#endif
-   result = laterReductionOperation. initialValueOnHost( 0, resultArray, ( ResultType* ) 0 );
-   for( IndexType i = 1; i < reducedSize; i ++ )
-      result = laterReductionOperation. reduceOnHost( i, result, resultArray, ( ResultType*) 0 );
-#ifdef CUDA_REDUCTION_PROFILING
-   cudaThreadSynchronize();
-   timer.stop();
-   cout << " Reduction of small data set on CPU took " << timer.getTime() << " sec. " << endl;
-#endif
+   result = laterReductionOperation. initialValue();
+   for( IndexType i = 0; i < reducedSize; i ++ )
+      result = laterReductionOperation.reduceOnHost( i, result, resultArray, ( ResultType*) 0 );
+
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.stop();
+      cout << " Reduction of small data set on CPU took " << timer.getTime() << " sec. " << endl;
+   #endif
+
   return checkCudaDevice;
 #else
   tnlCudaSupportMissingMessage;;
diff --git a/src/core/cuda/reduction-operations.h b/src/core/cuda/reduction-operations.h
index 04566a8d8627dae8dc767630439da5830292bd9a..f804847f2e8ba12be96f82a965dc6a98eb1df237 100644
--- a/src/core/cuda/reduction-operations.h
+++ b/src/core/cuda/reduction-operations.h
@@ -200,13 +200,6 @@ class tnlParallelReductionSum
    typedef Real ResultType;
    typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -226,46 +219,6 @@ class tnlParallelReductionSum
    }
 
 #ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] + data1[ idx2 ];
-   };
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + data2[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
 
    __device__ ResultType commonReductionOnDevice( ResultType& result,
                                                   const ResultType& data ) const
@@ -279,13 +232,6 @@ class tnlParallelReductionSum
       result += data;
    };
 
-
-   __device__ void performInPlace( ResultType& a,
-                                   const ResultType& b ) const
-   {
-      a += b;
-   }
-
 #endif
 };
@@ -299,13 +245,6 @@ class tnlParallelReductionMin
    typedef Real ResultType;
    typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -324,41 +263,7 @@ class tnlParallelReductionMin
       result = tnlCudaMin( result, data1[ index ] );
    }
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], data1[ idx2 ] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], tnlCudaMin( data2[ idx2 ], data2[ idx3 ] ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] );
-   };
-
+#ifdef HAVE_CUDA
    __device__ ResultType commonReductionOnDevice( ResultType& result,
                                                   const ResultType& data ) const
    {
@@ -385,13 +290,6 @@ class tnlParallelReductionMax
    typedef Real ResultType;
    typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -410,48 +308,7 @@ class tnlParallelReductionMax
       result = tnlCudaMax( result, data1[ index ] );
    }
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], data1[ idx2 ] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], tnlCudaMax( data2[ idx2 ], data2[ idx3 ] ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  volatile const ResultType* data ) const
-   {
-      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
-   };
-
+#ifdef HAVE_CUDA
    __device__ ResultType commonReductionOnDevice( ResultType& result,
                                                   const ResultType& data ) const
    {
@@ -467,216 +324,112 @@ };
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionAbsSum : public tnlParallelReductionSum< Real, Index >
+class tnlParallelReductionLogicalAnd
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] );
-   };
+   typedef tnlParallelReductionLogicalAnd< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return current + tnlAbs( data1[ idx ] );
+      return current && data1[ idx ];
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
-
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) true; };
+
    __cuda_callable__ void cudaFirstReduction( ResultType& result,
                                               const IndexType index,
                                               const RealType* data1,
                                               const RealType* data2 ) const
    {
-      result += tnlCudaAbs( data1[ index ] );
+      result = result && data1[ index ];
    }
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] );
-   };
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ) + tnlCudaAbs( data2[ idx3 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
+#ifdef HAVE_CUDA
+   __device__ ResultType commonReductionOnDevice( ResultType& result,
+                                                  const ResultType& data ) const
    {
-      return data[ idx1 ] + data[ idx2 ];
+      result = result && data;
    };
 
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
+   __device__ ResultType commonReductionOnDevice( volatile ResultType& result,
+                                                  volatile const ResultType& data ) const
    {
-      return data1 + data2;
+      result = result && data;
    };
 
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };*/
-
-
+
 #endif
 };
+
 template< typename Real, typename Index >
-class tnlParallelReductionAbsMin : public tnlParallelReductionMin< Real, Index >
+class tnlParallelReductionLogicalOr
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] );
-   };
+   typedef tnlParallelReductionLogicalOr< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return Min( current, tnlAbs( data1[ idx ] ) );
+      return current || data1[ idx ];
    };
-
-   __cuda_callable__ ResultType initialValue() const { return tnlMaxValue< ResultType>(); };
+
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) false; };
 
    __cuda_callable__ void cudaFirstReduction( ResultType& result,
                                               const IndexType index,
                                               const RealType* data1,
                                               const RealType* data2 ) const
    {
-      result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] ) );
-   }
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
+      result = result || data1[ index ];
    }
 
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], tnlCudaMin( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
-   };
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
+#ifdef HAVE_CUDA
+   __device__ ResultType commonReductionOnDevice( ResultType& result,
+                                                  const ResultType& data ) const
    {
-      volatile ResultType aux = tnlCudaAbs( data[ idx2 ] );
-      return tnlCudaMin( data[ idx1 ], aux );
+      result = result || data;
    };
 
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
+   __device__ ResultType commonReductionOnDevice( volatile ResultType& result,
+                                                  volatile const ResultType& data ) const
    {
-      return tnlCudaMin( data1, data2 );
+      result = result || data;
    };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return tnlCudaMin( data1, data2 );
-   };*/
-
-
 #endif
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionAbsMax : public tnlParallelReductionMax< Real, Index >
+class tnlParallelReductionAbsSum : public tnlParallelReductionSum< Real, Index >
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] );
-   };
+   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return Max( current, tnlAbs( data1[ idx ] ) );
+      return current + tnlAbs( data1[ idx ] );
    };
 
    __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
@@ -686,179 +439,82 @@ class tnlParallelReductionAbsMax : public tnlParallelReductionMax< Real, Index >
                                               const RealType* data1,
                                               const RealType* data2 ) const
    {
-      result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] ) );
-   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
+      result += tnlCudaAbs( data1[ index ] );
    }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], tnlCudaMax( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      volatile ResultType aux = tnlCudaAbs( data[ idx2 ] );
-      return tnlCudaMax( data[ idx1 ], aux );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return tnlCudaMax( data1, data2 );
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return tnlCudaMax( data1, data2 );
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionLogicalAnd
+class tnlParallelReductionAbsMin : public tnlParallelReductionMin< Real, Index >
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionLogicalAnd< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
+   typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return current && data1[ idx ];
+      return Min( current, tnlAbs( data1[ idx ] ) );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) true; };
+   __cuda_callable__ ResultType initialValue() const { return tnlMaxValue< ResultType>(); };
 
    __cuda_callable__ void cudaFirstReduction( ResultType& result,
                                               const IndexType index,
                                               const RealType* data1,
                                               const RealType* data2 ) const
    {
-      result = result && data1[ index ];
-   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] && data1[ idx2 ];
-   }
+      result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] ) );
+   }
+};
 
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
+template< typename Real, typename Index >
+class tnlParallelReductionAbsMax : public tnlParallelReductionMax< Real, Index >
+{
+   public:
 
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] && data2[ idx2 ] && data2[ idx3 ];
-   };
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Real ResultType;
+   typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
    {
-      return data1[ idx1 ] && data2[ idx2 ];
+      return Max( current, tnlAbs( data1[ idx ] ) );
    };
 
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] && data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( ResultType& result,
-                                                  const ResultType& data ) const
-   {
-      result = result && data;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile ResultType& result,
-                                                  volatile const ResultType& data ) const
-   {
-      result = result && data;
-   };
-
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
-#endif
+   __cuda_callable__ void cudaFirstReduction( ResultType& result,
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
+   {
+      result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] ) );
+   }
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionLogicalOr
+class tnlParallelReductionLpNorm : public tnlParallelReductionSum< Real, Index >
 {
    public:
 
   typedef Real RealType;
   typedef Index IndexType;
   typedef Real ResultType;
-   typedef tnlParallelReductionLogicalOr< Real, Index > LaterReductionOperation;
+   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
+   void setPower( const RealType& p )
   {
-      return data1[ idx ];
+      this -> p = p;
   };
 
   ResultType reduceOnHost( const IndexType idx,
@@ -866,200 +522,34 @@ class tnlParallelReductionLogicalOr
                            const RealType* data1,
                            const RealType* data2 ) const
   {
-      return current || data1[ idx ];
+      return current + pow( tnlAbs( data1[ idx ] ), p );
   };
-
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) false; };
+
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
 
   __cuda_callable__ void cudaFirstReduction( ResultType& result,
                                              const IndexType index,
                                              const RealType* data1,
                                              const RealType* data2 ) const
   {
-      result = result || data1[ index ];
+      result += tnlCudaPow( tnlCudaAbs( data1[ index ] ), p );
   }
+
+   protected:
+
+   RealType p;
+};
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] || data1[ idx2 ];
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] || data2[ idx2 ] || data2[ idx3 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] || data2[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] || data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( ResultType& result,
-                                                  const ResultType& data ) const
-   {
-      result = result || data;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile ResultType& result,
-                                                  volatile const ResultType& data ) const
-   {
-      result = result || data;
-   };
-
-
-#endif
-};
-
-template< typename Real, typename Index >
-class tnlParallelReductionLpNorm : public tnlParallelReductionSum< Real, Index >
-{
-   public:
-
-   typedef Real RealType;
-   typedef Index IndexType;
-   typedef Real ResultType;
-   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
-
-   void setPower( const RealType& p )
-   {
-      this -> p = p;
-   };
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return pow( tnlAbs( data1[ idx ] ), p );
-   };
-
-   ResultType reduceOnHost( const IndexType idx,
-                            const ResultType& current,
-                            const RealType* data1,
-                            const RealType* data2 ) const
-   {
-      return current + pow( tnlAbs( data1[ idx ] ), p );
-   };
-
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
-
-   __cuda_callable__ void cudaFirstReduction( ResultType& result,
-                                              const IndexType index,
-                                              const RealType* data1,
-                                              const RealType* data2 ) const
-   {
-      result += tnlCudaPow( tnlCudaAbs( data1[ index ] ), p );
-   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] ), p ) + tnlCudaPow( tnlCudaAbs( data1[ idx2 ] ), p );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] ), p );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             tnlCudaPow( tnlCudaAbs( data2[ idx2 ] ), p ) +
-             tnlCudaPow( tnlCudaAbs( data2[ idx3 ] ), p );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + tnlCudaPow( tnlCudaAbs( data2[ idx2 ] ), p );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };*/
-
-
-#endif
-
-   protected:
-
-   RealType p;
-};
-
-template< typename Real, typename Index >
-class tnlParallelReductionEqualities : public tnlParallelReductionLogicalAnd< bool, Index >
-{
-   public:
+template< typename Real, typename Index >
+class tnlParallelReductionEqualities : public tnlParallelReductionLogicalAnd< bool, Index >
+{
+   public:
 
   typedef Real RealType;
   typedef Index IndexType;
   typedef bool ResultType;
   typedef tnlParallelReductionLogicalAnd< bool, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return ( data1[ idx ] == data2[ idx ] );
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1077,66 +567,6 @@ class tnlParallelReductionEqualities : public tnlParallelReductionLogicalAnd< bo
   {
      result = result && ( data1[ index ] == data2[ index ] );
   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] == data2[ idx1 ] ) && ( data1[ idx2 ] == data2[ idx2] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ]== data2[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] &&
-             ( data2[ idx2 ] == data2[ idx2] ) &&
-             ( data2[ idx3 ] == data3[ idx3] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] && ( data2[ idx2 ] == data3[ idx2 ] );
-   };
-
-   /* __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] && data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 && data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 && data2;
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1149,13 +579,6 @@ class tnlParallelReductionInequalities : public tnlParallelReductionLogicalAnd<
    typedef bool ResultType;
    typedef tnlParallelReductionLogicalAnd< bool, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return ( data1[ idx ] != data2[ idx ] );
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -1173,66 +596,6 @@ class tnlParallelReductionInequalities : public tnlParallelReductionLogicalAnd<
    {
      result = result && ( data1[ index ] != data2[ index ] );
    }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] != data2[ idx1 ] ) && ( data1[ idx2 ] != data2[ idx2] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] != data2[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] &&
-             ( data2[ idx2 ] != data2[ idx2] ) &&
-             ( data2[ idx3 ] != data3[ idx3] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] && ( data2[ idx2 ] != data3[ idx2 ] );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] && data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 && data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 && data2;
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1245,13 +608,6 @@ class tnlParallelReductionScalarProduct : public tnlParallelReductionSum< Real,
    typedef Real ResultType;
    typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ] * data2[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -1268,66 +624,7 @@ class tnlParallelReductionScalarProduct : public tnlParallelReductionSum< Real,
                             const RealType* data2 ) const
    {
      result += data1[ index ] * data2[ index ];
-   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] * data2[ idx1 ] ) + ( data1[ idx2 ] * data2[ idx2] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] * data2[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             ( data2[ idx2 ] * data3[ idx2] ) +
-             ( data2[ idx3 ] * data3[ idx3] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + ( data2[ idx2 ] * data3[ idx2 ] );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };*/
-
-#endif
+   }
 };
 
 template< typename Real, typename Index >
@@ -1340,13 +637,6 @@ class tnlParallelReductionDiffSum : public tnlParallelReductionSum< Real, Index
    typedef Real ResultType;
    typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ] - data2[ idx ];
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1363,67 +653,7 @@ class tnlParallelReductionDiffSum : public tnlParallelReductionSum< Real, Index
                            const RealType* data2 ) const
   {
      result += data1[ index ] - data2[ index ];
-   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] - data2[ idx1 ] ) + ( data1[ idx2 ] - data2[ idx2 ] );
-   };
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] - data2[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             ( data2[ idx2 ] - data3[ idx2 ] ) +
-             ( data2[ idx3 ] - data3[ idx3 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + ( data2[ idx2 ] - data3[ idx2 ] );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };*/
-
-
-#endif
+   }
 };
 
 template< typename Real, typename Index >
@@ -1436,13 +666,6 @@ class tnlParallelReductionDiffMin : public tnlParallelReductionMin< Real, Index
    typedef Real ResultType;
    typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ] - data2[ idx ];
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1460,66 +683,6 @@ class tnlParallelReductionDiffMin : public tnlParallelReductionMin< Real, Index
   {
      result = tnlCudaMin( result, data1[ index ] - data2[ index ] );
   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ] - data2[ idx1 ], data1[ idx2 ] - data2[ idx2 ] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] - data2[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ],
-                         tnlCudaMin( data2[ idx2 ] - data3[ idx2 ],
-                                     data2[ idx3 ] - data3[ idx3 ] ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] - data3[ idx2 ] );
-   };
-
-   /* __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     volatile const ResultType* data ) const
-   {
-      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return tnlCudaMin( data1, data2 );
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return tnlCudaMin( data1, data2 );
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1532,13 +695,6 @@ class tnlParallelReductionDiffMax : public tnlParallelReductionMax< Real, Index
    typedef Real ResultType;
    typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ] - data2[ idx ];
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1556,67 +712,6 @@ class tnlParallelReductionDiffMax : public tnlParallelReductionMax< Real, Index
   {
      result = tnlCudaMax( result, data1[ index ] - data2[ index ] );
   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ] - data2[ idx1 ],
-                         data1[ idx2 ] - data2[ idx2 ] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] - data2[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ],
-                         tnlCudaMax( data2[ idx2 ] - data3[ idx2 ],
-                                     data2[ idx3 ] - data3[ idx3 ] ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] - data3[ idx2 ] );
-   };
-
-   /* __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     volatile const ResultType* data ) const
-   {
-      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return tnlCudaMax( data1, data2 );
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return tnlCudaMax( data1, data2 );
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1629,13 +724,6 @@ class tnlParallelReductionDiffAbsSum : public tnlParallelReductionMax< Real, Ind
    typedef Real ResultType;
    typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] - data2[ idx ] );
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1653,67 +741,6 @@ class tnlParallelReductionDiffAbsSum : public tnlParallelReductionMax< Real, Ind
   {
      result += tnlCudaAbs( data1[ index ] - data2[ index ] );
   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] );
-   };
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ) +
-             tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1726,13 +753,6 @@ class tnlParallelReductionDiffAbsMin : public tnlParallelReductionMin< Real, Ind
    typedef Real ResultType;
    typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] - data2[ idx ] );
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1750,69 +770,6 @@ class tnlParallelReductionDiffAbsMin : public tnlParallelReductionMin< Real, Ind
   {
      result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) );
   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ),
-                         tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ) );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ],
-                         tnlCudaMin( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ),
-                                     tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ) ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ],
-                         tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ) );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      //return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
-      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return tnlCudaMin( data1, data2 );
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return tnlCudaMin( data1, data2 );
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1825,13 +782,6 @@ class tnlParallelReductionDiffAbsMax : public tnlParallelReductionMax< Real, Ind
    typedef Real ResultType;
    typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] -data2[ idx ] );
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1849,69 +799,6 @@ class tnlParallelReductionDiffAbsMax : public tnlParallelReductionMax< Real, Ind
   {
      result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) );
   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ),
-                         tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ) );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ],
-                         tnlCudaMax( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ),
-                                     tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ) ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ],
-                         tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ) );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      //return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
-      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return tnlCudaMax( data1, data2 );
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return tnlCudaMax( data1, data2 );
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1929,13 +816,6 @@ class tnlParallelReductionDiffLpNorm : public tnlParallelReductionSum< Real, Ind
      this -> p = p;
   };
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return pow( tnlAbs( data1[ idx ] - data2[ idx ] ), p );
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1947,74 +827,13 @@ class tnlParallelReductionDiffLpNorm : public tnlParallelReductionSum< Real, Ind
   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
 
   __cuda_callable__ void cudaFirstReduction( ResultType& result,
-                                       const IndexType index,
-                                       const RealType* data1,
-                                       const RealType* data2 ) const
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
   {
      result += tnlCudaPow( tnlCudaAbs( data1[ index ] - data2[ index ] ), p );
   }
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p ) +
-             tnlCudaPow( tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ), p );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             tnlCudaPow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p ) +
-             tnlCudaPow( tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ), p );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + tnlCudaPow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };*/
-
-
-#endif
-
   protected:
 
   RealType p;
diff --git a/src/core/cuda/tnlCudaReduction.h b/src/core/cuda/tnlCudaReduction.h
index 563fc8fe124eae21122fc7fd709c52ccf04c293c..9b7bf7ab2cb234ecbeae74f4a571cc8383720cb0 100644
--- a/src/core/cuda/tnlCudaReduction.h
+++ b/src/core/cuda/tnlCudaReduction.h
@@ -20,7 +20,7 @@
 
 #ifdef HAVE_CUDA
 
-template< typename Operation, int blockSize, bool isSizePow2 >
+template< typename Operation, int blockSize >
 class tnlCUDAReduction
 {
    public:
@@ -37,8 +37,8 @@ class tnlCUDAReduction
                        ResultType* output );
 };
 
-/*template< typename Real, typename Index, int blockSize, bool isSizePow2 >
-class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 >
+/*template< typename Real, typename Index, int blockSize >
+class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >
 {
    public:
diff --git a/src/core/cuda/tnlCudaReduction_impl.h b/src/core/cuda/tnlCudaReduction_impl.h
index 20b47bbf46987d9f61992e9f25a9aa48c41e8b36..479ef7972505a2dbe7629886b2535b094371fde1 100644
--- a/src/core/cuda/tnlCudaReduction_impl.h
+++ b/src/core/cuda/tnlCudaReduction_impl.h
@@ -18,10 +18,10 @@
 #ifndef TNLCUDAREDUCTION_IMPL_H
 #define TNLCUDAREDUCTION_IMPL_H
 
-template< typename Operation, int blockSize, bool isSizePow2 >
+template< typename Operation, int blockSize >
 __device__
 void
-tnlCUDAReduction< Operation, blockSize, isSizePow2 >::
+tnlCUDAReduction< Operation, blockSize >::
 reduce( const Operation operation,
         const IndexType size,
         const RealType* input1,
@@ -154,10 +154,10 @@ reduce( const Operation operation,
 
 #ifdef UNDEF
 
-template< typename Real, typename Index, int blockSize, bool isSizePow2 >
+template< typename Real, typename Index, int blockSize >
 __device__
 void
-tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 >::
+tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >::
 reduce( const Operation operation,
         const IndexType size,
         const RealType* input1,
diff --git a/tests/benchmarks/tnl-cuda-benchmarks.h b/tests/benchmarks/tnl-cuda-benchmarks.h
index dc82af7423c0cfb0ef91675ef3fda1d365cf9d7a..38c108b5d90aad5232d0428c8ffa1bb9878526e6 100644
--- a/tests/benchmarks/tnl-cuda-benchmarks.h
+++ b/tests/benchmarks/tnl-cuda-benchmarks.h
@@ -100,20 +100,19 @@ int main( int argc, char* argv[] )
       cout << bandwidth << " GB/sec."
 */

-   Real resultHost, resultDevice;
-   cout << "Benchmarking scalar product on CPU: ";
-   timer.reset();
-   timer.start();
-   for( int i = 0; i < loops; i++ )
-      resultHost = hostVector.scalarProduct( hostVector2 );
-   timer.stop();
-   bandwidth = 2 * datasetSize / timer.getTime();
-   cout << bandwidth << " GB/sec." << endl;
+   Real resultHost, resultDevice;
+   cout << "Benchmarking scalar product on CPU: ";
+   timer.reset();
+   timer.start();
+   for( int i = 0; i < loops; i++ )
+      resultHost = hostVector.scalarProduct( hostVector2 );
+   timer.stop();
+   bandwidth = 2 * datasetSize / timer.getTime();
+   cout << bandwidth << " GB/sec." << endl;

    cout << "Benchmarking scalar product on GPU: " << endl;
    timer.reset();
    timer.start();
-   cout << "Time: " << timer.getTime() << endl;
    for( int i = 0; i < loops; i++ )
       resultDevice = deviceVector.scalarProduct( deviceVector );
    cout << "Time: " << timer.getTime() << endl;
@@ -144,6 +143,31 @@ int main( int argc, char* argv[] )
    cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
 #endif
 #endif

+
+   cout << "Benchmarking prefix-sum on CPU ..." << endl;
+   timer.reset();
+   timer.start();
+   hostVector.computePrefixSum();
+   timer.stop();
+   // The prefix-sum reads and writes each element once, hence the factor of 2.
+   bandwidth = 2 * datasetSize / timer.getTime();
+   cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
+
+   cout << "Benchmarking prefix-sum on GPU ..." << endl;
+   timer.reset();
+   timer.start();
+   deviceVector.computePrefixSum();
+   timer.stop();
+   bandwidth = 2 * datasetSize / timer.getTime();
+   cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
+
+   // Compare the GPU result against the CPU reference element by element.
+   for( int i = 0; i < size; i++ )
+      if( hostVector.getElement( i ) != deviceVector.getElement( i ) )
+      {
+         cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << deviceVector.getElement( i ) << endl;
+      }
+
    return EXIT_SUCCESS;
 }
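
The interface this patch converges on is the same in all three source files: each operation keeps its host-side reduceOnHost, the __cuda_callable__ pair initialValue() / cudaFirstReduction(), and a single in-place commonReductionOnDevice( result, item ) that both the reduction kernel and the prefix-sum kernels call. As a sketch only — the class name below is hypothetical, the exact qualifier and signature of commonReductionOnDevice are inferred from its call sites in cuda-prefix-sum_impl.h, and tnlCudaMax/tnlCudaAbs are the library's existing device helpers — a diff-abs-max operation written against the slimmed-down interface would look roughly like this:

// Hypothetical sketch, not part of the patch: a reduction operation computing
// max_i | data1[ i ] - data2[ i ] | against the reduced interface.
template< typename Real, typename Index >
class exampleParallelReductionDiffAbsMax
{
   public:

      typedef Real RealType;
      typedef Index IndexType;
      typedef Real ResultType;

      // | x | >= 0, so zero is a safe identity element for the maximum.
      __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };

      // First phase: fold one pair of input elements into the running result.
      __cuda_callable__ void cudaFirstReduction( ResultType& result,
                                                 const IndexType index,
                                                 const RealType* data1,
                                                 const RealType* data2 ) const
      {
         result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) );
      }

      // Tree phase: combine two partial results in place, the way the kernels
      // invoke it ( operation.commonReductionOnDevice( a, b ) updates a ).
      __device__ void commonReductionOnDevice( ResultType& result,
                                               const ResultType& data ) const
      {
         result = tnlCudaMax( result, data );
      }
};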