Loading src/core/cuda/cuda-prefix-sum_impl.h +15 −10 Original line number Diff line number Diff line Loading @@ -86,7 +86,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { operation.performInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ], operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ], sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx. x ] = sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ]; Loading @@ -100,7 +100,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT const int warpIdx = threadIdx. x / tnlCuda::getWarpSize(); for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 ) if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks ) operation.performInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] ); operation.commonReductionOnDevice( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] ); if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx. x ]; Loading @@ -112,14 +112,14 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( warpIdx == 0 ) for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 ) if( threadInWarpIdx >= stride ) operation.performInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] ); operation.commonReductionOnDevice( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] ); __syncthreads(); /**** * Shift the warp prefix-sums. */ if( warpIdx > 0 ) operation.performInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] ); operation.commonReductionOnDevice( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] ); /*** * Store the result back in global memory. Loading @@ -132,7 +132,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT DataType chunkShift( operation.initialValue() ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift ); operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ]; idx += blockDim. x; } Loading @@ -141,10 +141,15 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( threadIdx. x == 0 ) { if( prefixSumType == exclusivePrefixSum ) auxArray[ blockIdx. x ] = operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ), { /*auxArray[ blockIdx. x ] = operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ), tnlCuda::getInterleaving( lastElementInBlock ), sharedData ); sharedData );*/ DataType aux = operation.initialValue(); operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ] ); operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] ); auxArray[ blockIdx. x ] = aux; } else auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ]; } Loading src/core/cuda/cuda-reduction_impl.h +83 −146 Original line number Diff line number Diff line Loading @@ -37,27 +37,25 @@ using namespace std; /**** * Arrays smaller than the following constant * are reduced on CPU. The constant must not be larger * than maximal CUDA grid size. */ const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256; const int minGPUReductionDataSize = 128;//65536; //16384;//1024;//256; static tnlCudaReductionBuffer cudaReductionBuffer( 8 * minGPUReductionDataSize ); #ifdef HAVE_CUDA template< typename Operation, int blockSize, bool isSizePow2 > template< typename Operation, int blockSize > __global__ void tnlCUDAReductionKernel( const Operation operation, const typename Operation :: IndexType size, const typename Operation :: RealType* input1, const typename Operation :: RealType* input2, typename Operation :: ResultType* output ) { typedef tnlCUDAReduction< Operation, blockSize, isSizePow2 > Reduction; typedef tnlCUDAReduction< Operation, blockSize > Reduction; Reduction::reduce( operation, size, input1, input2, output ); }; Loading @@ -74,63 +72,52 @@ typename Operation::IndexType reduceOnCudaDevice( const Operation& operation, const IndexType desGridSize( minGPUReductionDataSize ); dim3 blockSize( 256 ), gridSize( 0 ); gridSize. x = Min( tnlCuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); /*#ifdef CUDA_REDUCTION_PROFILING tnlTimerRT timer; timer.reset(); timer.start(); #endif */ if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) ) return false; output = cudaReductionBuffer.template getData< ResultType >(); IndexType shmem = blockSize.x * sizeof( ResultType ); /*** * Depending on the blockSize we generate appropriate template instance. */ if( isPow2( size ) ) { switch( blockSize.x ) { case 512: tnlCUDAReductionKernel< Operation, 512, true > tnlCUDAReductionKernel< Operation, 512 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 256: tnlCUDAReductionKernel< Operation, 256, true > tnlCUDAReductionKernel< Operation, 256 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 128: tnlCUDAReductionKernel< Operation, 128, true > tnlCUDAReductionKernel< Operation, 128 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 64: tnlCUDAReductionKernel< Operation, 64, true > tnlCUDAReductionKernel< Operation, 64 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 32: tnlCUDAReductionKernel< Operation, 32, true > tnlCUDAReductionKernel< Operation, 32 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 16: tnlCUDAReductionKernel< Operation, 16, true > tnlCUDAReductionKernel< Operation, 16 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 8: tnlCUDAReductionKernel< Operation, 8, true > tnlCUDAReductionKernel< Operation, 8 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 4: tnlCUDAReductionKernel< Operation, 4, true > tnlCUDAReductionKernel< Operation, 4 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 2: tnlCUDAReductionKernel< Operation, 2, true > tnlCUDAReductionKernel< Operation, 2 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 1: Loading @@ -138,59 +125,7 @@ typename Operation::IndexType reduceOnCudaDevice( const Operation& operation, default: tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } } else { switch( blockSize.x ) { case 512: tnlCUDAReductionKernel< Operation, 512, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 256: tnlCUDAReductionKernel< Operation, 256, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 128: tnlCUDAReductionKernel< Operation, 128, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 64: tnlCUDAReductionKernel< Operation, 64, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 32: tnlCUDAReductionKernel< Operation, 32, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 16: tnlCUDAReductionKernel< Operation, 16, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 8: tnlCUDAReductionKernel< Operation, 8, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 4: tnlCUDAReductionKernel< Operation, 4, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 2: tnlCUDAReductionKernel< Operation, 2, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 1: tnlAssert( false, cerr << "blockSize should not be 1." << endl ); default: tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } } //checkCudaDevice; /*#ifdef CUDA_REDUCTION_PROFILING //cudaThreadSynchronize(); timer.stop(); cout << " Main reduction on GPU took " << timer.getTime() << " sec. " << endl; #endif */ return gridSize. x; } #endif Loading Loading @@ -222,20 +157,21 @@ bool reductionOnCudaDevice( const Operation& operation, if( deviceInput2 && ! tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, size ) ) return false; result = operation. initialValueOnHost( 0, hostArray1, hostArray2 ); for( IndexType i = 1; i < size; i ++ ) result = operation.initialValue(); for( IndexType i = 0; i < size; i ++ ) result = operation.reduceOnHost( i, result, hostArray1, hostArray2 ); return true; } /**** * Reduce the data on the CUDA device. */ #ifdef CUDA_REDUCTION_PROFILING tnlTimerRT timer; timer.reset(); timer.start(); #endif /**** * Reduce the data on the CUDA device. */ ResultType* deviceAux1( 0 ); IndexType reducedSize = reduceOnCudaDevice( operation, size, Loading @@ -245,39 +181,40 @@ bool reductionOnCudaDevice( const Operation& operation, #ifdef CUDA_REDUCTION_PROFILING timer.stop(); cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getTime() << " sec. " << endl; timer.reset(); timer.start(); #endif /*** * Transfer the reduced data from device to host. */ #ifdef CUDA_REDUCTION_PROFILING timer.reset(); timer.start(); #endif ResultType resultArray[ minGPUReductionDataSize ]; if( ! tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) ) return false; #ifdef CUDA_REDUCTION_PROFILING timer.stop(); cout << " Transferring data to CPU took " << timer.getTime() << " sec. " << endl; #endif /*** * Reduce the data on the host system. */ LaterReductionOperation laterReductionOperation; #ifdef CUDA_REDUCTION_PROFILING timer.reset(); timer.start(); #endif result = laterReductionOperation. initialValueOnHost( 0, resultArray, ( ResultType* ) 0 ); for( IndexType i = 1; i < reducedSize; i ++ ) /*** * Reduce the data on the host system. */ LaterReductionOperation laterReductionOperation; result = laterReductionOperation. initialValue(); for( IndexType i = 0; i < reducedSize; i ++ ) result = laterReductionOperation.reduceOnHost( i, result, resultArray, ( ResultType*) 0 ); #ifdef CUDA_REDUCTION_PROFILING cudaThreadSynchronize(); timer.stop(); cout << " Reduction of small data set on CPU took " << timer.getTime() << " sec. " << endl; #endif return checkCudaDevice; #else tnlCudaSupportMissingMessage;; Loading src/core/cuda/reduction-operations.h +83 −1264 File changed.Preview size limit exceeded, changes collapsed. Show changes src/core/cuda/tnlCudaReduction.h +3 −3 Original line number Diff line number Diff line Loading @@ -20,7 +20,7 @@ #ifdef HAVE_CUDA template< typename Operation, int blockSize, bool isSizePow2 > template< typename Operation, int blockSize > class tnlCUDAReduction { public: Loading @@ -37,8 +37,8 @@ class tnlCUDAReduction ResultType* output ); }; /*template< typename Real, typename Index, int blockSize, bool isSizePow2 > class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 > /*template< typename Real, typename Index, int blockSize > class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize > { public: Loading src/core/cuda/tnlCudaReduction_impl.h +4 −4 Original line number Diff line number Diff line Loading @@ -18,10 +18,10 @@ #ifndef TNLCUDAREDUCTION_IMPL_H #define TNLCUDAREDUCTION_IMPL_H template< typename Operation, int blockSize, bool isSizePow2 > template< typename Operation, int blockSize > __device__ void tnlCUDAReduction< Operation, blockSize, isSizePow2 >:: tnlCUDAReduction< Operation, blockSize >:: reduce( const Operation operation, const IndexType size, const RealType* input1, Loading Loading @@ -154,10 +154,10 @@ reduce( const Operation operation, #ifdef UNDEF template< typename Real, typename Index, int blockSize, bool isSizePow2 > template< typename Real, typename Index, int blockSize > __device__ void tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 >:: tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >:: reduce( const Operation operation, const IndexType size, const RealType* input1, Loading Loading
src/core/cuda/cuda-prefix-sum_impl.h +15 −10 Original line number Diff line number Diff line Loading @@ -86,7 +86,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { operation.performInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ], operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ], sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx. x ] = sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ]; Loading @@ -100,7 +100,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT const int warpIdx = threadIdx. x / tnlCuda::getWarpSize(); for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 ) if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks ) operation.performInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] ); operation.commonReductionOnDevice( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] ); if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx. x ]; Loading @@ -112,14 +112,14 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( warpIdx == 0 ) for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 ) if( threadInWarpIdx >= stride ) operation.performInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] ); operation.commonReductionOnDevice( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] ); __syncthreads(); /**** * Shift the warp prefix-sums. */ if( warpIdx > 0 ) operation.performInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] ); operation.commonReductionOnDevice( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] ); /*** * Store the result back in global memory. Loading @@ -132,7 +132,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT DataType chunkShift( operation.initialValue() ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift ); operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ]; idx += blockDim. x; } Loading @@ -141,10 +141,15 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( threadIdx. x == 0 ) { if( prefixSumType == exclusivePrefixSum ) auxArray[ blockIdx. x ] = operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ), { /*auxArray[ blockIdx. x ] = operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ), tnlCuda::getInterleaving( lastElementInBlock ), sharedData ); sharedData );*/ DataType aux = operation.initialValue(); operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ] ); operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] ); auxArray[ blockIdx. x ] = aux; } else auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ]; } Loading
src/core/cuda/cuda-reduction_impl.h +83 −146 Original line number Diff line number Diff line Loading @@ -37,27 +37,25 @@ using namespace std; /**** * Arrays smaller than the following constant * are reduced on CPU. The constant must not be larger * than maximal CUDA grid size. */ const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256; const int minGPUReductionDataSize = 128;//65536; //16384;//1024;//256; static tnlCudaReductionBuffer cudaReductionBuffer( 8 * minGPUReductionDataSize ); #ifdef HAVE_CUDA template< typename Operation, int blockSize, bool isSizePow2 > template< typename Operation, int blockSize > __global__ void tnlCUDAReductionKernel( const Operation operation, const typename Operation :: IndexType size, const typename Operation :: RealType* input1, const typename Operation :: RealType* input2, typename Operation :: ResultType* output ) { typedef tnlCUDAReduction< Operation, blockSize, isSizePow2 > Reduction; typedef tnlCUDAReduction< Operation, blockSize > Reduction; Reduction::reduce( operation, size, input1, input2, output ); }; Loading @@ -74,63 +72,52 @@ typename Operation::IndexType reduceOnCudaDevice( const Operation& operation, const IndexType desGridSize( minGPUReductionDataSize ); dim3 blockSize( 256 ), gridSize( 0 ); gridSize. x = Min( tnlCuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); /*#ifdef CUDA_REDUCTION_PROFILING tnlTimerRT timer; timer.reset(); timer.start(); #endif */ if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) ) return false; output = cudaReductionBuffer.template getData< ResultType >(); IndexType shmem = blockSize.x * sizeof( ResultType ); /*** * Depending on the blockSize we generate appropriate template instance. */ if( isPow2( size ) ) { switch( blockSize.x ) { case 512: tnlCUDAReductionKernel< Operation, 512, true > tnlCUDAReductionKernel< Operation, 512 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 256: tnlCUDAReductionKernel< Operation, 256, true > tnlCUDAReductionKernel< Operation, 256 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 128: tnlCUDAReductionKernel< Operation, 128, true > tnlCUDAReductionKernel< Operation, 128 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 64: tnlCUDAReductionKernel< Operation, 64, true > tnlCUDAReductionKernel< Operation, 64 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 32: tnlCUDAReductionKernel< Operation, 32, true > tnlCUDAReductionKernel< Operation, 32 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 16: tnlCUDAReductionKernel< Operation, 16, true > tnlCUDAReductionKernel< Operation, 16 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 8: tnlCUDAReductionKernel< Operation, 8, true > tnlCUDAReductionKernel< Operation, 8 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 4: tnlCUDAReductionKernel< Operation, 4, true > tnlCUDAReductionKernel< Operation, 4 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 2: tnlCUDAReductionKernel< Operation, 2, true > tnlCUDAReductionKernel< Operation, 2 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 1: Loading @@ -138,59 +125,7 @@ typename Operation::IndexType reduceOnCudaDevice( const Operation& operation, default: tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } } else { switch( blockSize.x ) { case 512: tnlCUDAReductionKernel< Operation, 512, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 256: tnlCUDAReductionKernel< Operation, 256, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 128: tnlCUDAReductionKernel< Operation, 128, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 64: tnlCUDAReductionKernel< Operation, 64, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 32: tnlCUDAReductionKernel< Operation, 32, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 16: tnlCUDAReductionKernel< Operation, 16, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 8: tnlCUDAReductionKernel< Operation, 8, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 4: tnlCUDAReductionKernel< Operation, 4, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 2: tnlCUDAReductionKernel< Operation, 2, false > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 1: tnlAssert( false, cerr << "blockSize should not be 1." << endl ); default: tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } } //checkCudaDevice; /*#ifdef CUDA_REDUCTION_PROFILING //cudaThreadSynchronize(); timer.stop(); cout << " Main reduction on GPU took " << timer.getTime() << " sec. " << endl; #endif */ return gridSize. x; } #endif Loading Loading @@ -222,20 +157,21 @@ bool reductionOnCudaDevice( const Operation& operation, if( deviceInput2 && ! tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, size ) ) return false; result = operation. initialValueOnHost( 0, hostArray1, hostArray2 ); for( IndexType i = 1; i < size; i ++ ) result = operation.initialValue(); for( IndexType i = 0; i < size; i ++ ) result = operation.reduceOnHost( i, result, hostArray1, hostArray2 ); return true; } /**** * Reduce the data on the CUDA device. */ #ifdef CUDA_REDUCTION_PROFILING tnlTimerRT timer; timer.reset(); timer.start(); #endif /**** * Reduce the data on the CUDA device. */ ResultType* deviceAux1( 0 ); IndexType reducedSize = reduceOnCudaDevice( operation, size, Loading @@ -245,39 +181,40 @@ bool reductionOnCudaDevice( const Operation& operation, #ifdef CUDA_REDUCTION_PROFILING timer.stop(); cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getTime() << " sec. " << endl; timer.reset(); timer.start(); #endif /*** * Transfer the reduced data from device to host. */ #ifdef CUDA_REDUCTION_PROFILING timer.reset(); timer.start(); #endif ResultType resultArray[ minGPUReductionDataSize ]; if( ! tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) ) return false; #ifdef CUDA_REDUCTION_PROFILING timer.stop(); cout << " Transferring data to CPU took " << timer.getTime() << " sec. " << endl; #endif /*** * Reduce the data on the host system. */ LaterReductionOperation laterReductionOperation; #ifdef CUDA_REDUCTION_PROFILING timer.reset(); timer.start(); #endif result = laterReductionOperation. initialValueOnHost( 0, resultArray, ( ResultType* ) 0 ); for( IndexType i = 1; i < reducedSize; i ++ ) /*** * Reduce the data on the host system. */ LaterReductionOperation laterReductionOperation; result = laterReductionOperation. initialValue(); for( IndexType i = 0; i < reducedSize; i ++ ) result = laterReductionOperation.reduceOnHost( i, result, resultArray, ( ResultType*) 0 ); #ifdef CUDA_REDUCTION_PROFILING cudaThreadSynchronize(); timer.stop(); cout << " Reduction of small data set on CPU took " << timer.getTime() << " sec. " << endl; #endif return checkCudaDevice; #else tnlCudaSupportMissingMessage;; Loading
src/core/cuda/reduction-operations.h +83 −1264 File changed.Preview size limit exceeded, changes collapsed. Show changes
src/core/cuda/tnlCudaReduction.h +3 −3 Original line number Diff line number Diff line Loading @@ -20,7 +20,7 @@ #ifdef HAVE_CUDA template< typename Operation, int blockSize, bool isSizePow2 > template< typename Operation, int blockSize > class tnlCUDAReduction { public: Loading @@ -37,8 +37,8 @@ class tnlCUDAReduction ResultType* output ); }; /*template< typename Real, typename Index, int blockSize, bool isSizePow2 > class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 > /*template< typename Real, typename Index, int blockSize > class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize > { public: Loading
src/core/cuda/tnlCudaReduction_impl.h +4 −4 Original line number Diff line number Diff line Loading @@ -18,10 +18,10 @@ #ifndef TNLCUDAREDUCTION_IMPL_H #define TNLCUDAREDUCTION_IMPL_H template< typename Operation, int blockSize, bool isSizePow2 > template< typename Operation, int blockSize > __device__ void tnlCUDAReduction< Operation, blockSize, isSizePow2 >:: tnlCUDAReduction< Operation, blockSize >:: reduce( const Operation operation, const IndexType size, const RealType* input1, Loading Loading @@ -154,10 +154,10 @@ reduce( const Operation operation, #ifdef UNDEF template< typename Real, typename Index, int blockSize, bool isSizePow2 > template< typename Real, typename Index, int blockSize > __device__ void tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 >:: tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >:: reduce( const Operation operation, const IndexType size, const RealType* input1, Loading