From c60aa9e42a4afdb71e317c753dfac73ddc19858c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 11 Apr 2019 21:53:36 +0200 Subject: [PATCH 1/9] Rewritting parallel reduciton with lambda functions. --- .../Algorithms/CudaReductionKernel.h | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index a5823849d..08a767c60 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -39,14 +39,19 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi static constexpr int Reduction_minBlocksPerMultiprocessor = 4; #endif -template< int blockSize, typename Operation, typename Index > +template< int blockSize, + typename Real, + typename FirstPhase, + typename SecondPhase, + typename Index, + typename ResultType = decltype( std::declval< FirstPhase >( 0,0 ) ) > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) -CudaReductionKernel( Operation operation, +CudaReductionKernel( const Real& initialValue, + FirstReduction& firstReduction, + SecondReduction& secondReduction, const Index size, - const typename Operation::DataType1* input1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* output ) + ResultType* output ) { typedef Index IndexType; typedef typename Operation::ResultType ResultType; @@ -62,23 +67,23 @@ CudaReductionKernel( Operation operation, IndexType gid = blockIdx.x * blockDim. x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; - sdata[ tid ] = operation.initialValue(); + sdata[ tid ] = initialValue; /*** * Read data into the shared memory. We start with the * sequential reduction. 
*/ while( gid + 4 * gridSize < size ) { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 ); + sdata[ tid ] = firstReduction( sdata[ tid ], gid ); + sdata[ tid ] = firstReduction( sdata[ tid ], gid + gridSize ); + sdata[ tid ] = firstReduction( sdata[ tid ], gid + 2 * gridSize ); + sdata[ tid ] = firstReduction( sdata[ tid ], gid + 3 * gridSize ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); + firstReduction( sdata[ tid ], gid, input1, input2 ); + firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); gid += 2 * gridSize; } while( gid < size ) -- GitLab From 5c983e498eb31ae1d19d82e201fe589482b55a4c Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Fri, 12 Apr 2019 15:54:53 +0200 Subject: [PATCH 2/9] [WIP] Rewritting reduction using lambda functions. --- .../Algorithms/ArrayOperationsCuda.hpp | 7 +- .../Algorithms/CudaReductionKernel.h | 112 +++++++------- src/TNL/Containers/Algorithms/Reduction.h | 49 +++--- .../Containers/Algorithms/Reduction_impl.h | 140 ++++++++++-------- .../Algorithms/VectorOperationsHost_impl.h | 24 ++- src/TNL/Math.h | 13 +- 6 files changed, 196 insertions(+), 149 deletions(-) diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 3efe7e4d4..ab8770013 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -220,9 +220,10 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." 
); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; - Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; - reductionContainsValue.setValue( value ); - return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr ); + //Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; + //reductionContainsValue.setValue( value ); + auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; + return Reduction< Devices::Cuda >::reduce( size, TNL::sum, fetch, 0.0 ); } template< typename Element, diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 08a767c60..0a71aaa79 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -41,20 +41,21 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi template< int blockSize, typename Real, - typename FirstPhase, - typename SecondPhase, + typename DataFetcher, + typename Reduction, typename Index, - typename ResultType = decltype( std::declval< FirstPhase >( 0,0 ) ) > + typename Result = decltype( std::declval< DataFetcher >( 0,0 ) ) > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) -CudaReductionKernel( const Real& initialValue, - FirstReduction& firstReduction, - SecondReduction& secondReduction, +CudaReductionKernel( const Real& zero, + const DataFetcher& dataFetcher, + const Reduction& reduction, const Index size, - ResultType* output ) + Result* output ) { - typedef Index IndexType; - typedef typename Operation::ResultType ResultType; + using RealType = Real; + using IndexType = Index; + using ResultType = Result; ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); @@ -67,28 +68,28 @@ CudaReductionKernel( const Real& initialValue, IndexType gid = blockIdx.x * blockDim. 
x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; - sdata[ tid ] = initialValue; + sdata[ tid ] = zero; /*** * Read data into the shared memory. We start with the * sequential reduction. */ while( gid + 4 * gridSize < size ) { - sdata[ tid ] = firstReduction( sdata[ tid ], gid ); - sdata[ tid ] = firstReduction( sdata[ tid ], gid + gridSize ); - sdata[ tid ] = firstReduction( sdata[ tid ], gid + 2 * gridSize ); - sdata[ tid ] = firstReduction( sdata[ tid ], gid + 3 * gridSize ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - firstReduction( sdata[ tid ], gid, input1, input2 ); - firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < size ) { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); @@ -103,19 +104,19 @@ CudaReductionKernel( const Real& initialValue, if( blockSize >= 1024 ) { if( tid < 512 ) - operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); + reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) - operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); + reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); + reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -123,7 +124,7 @@ 
CudaReductionKernel( const Real& initialValue, if( blockSize >= 128 ) { if( tid < 64 ) - operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); + reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -131,40 +132,40 @@ CudaReductionKernel( const Real& initialValue, /*** * This runs in one warp so it is synchronized implicitly. - */ + */ if( tid < 32 ) { volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work // We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); + reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } @@ -180,16 +181,21 @@ CudaReductionKernel( const Real& initialValue, } -template< typename Operation, typename Index > +template< typename Real, + typename DataFetcher, + typename Reduction, + typename Index, + typename Result > int 
-CudaReductionKernelLauncher( Operation& operation, - const Index size, - const typename Operation::DataType1* input1, - const typename Operation::DataType2* input2, - typename Operation::ResultType*& output ) +CudaReductionKernelLauncher( const Index size, + const Reduction& reduction, + const DataFetcher& dataFetcher, + const Real& zero, + Result*& output ) { - typedef Index IndexType; - typedef typename Operation::ResultType ResultType; + using RealType = Real; + using IndexType = Index; + using ResultType = Result; // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because @@ -227,55 +233,55 @@ CudaReductionKernelLauncher( Operation& operation, { case 512: CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Real, 
DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 2: - 
cudaFuncSetCacheConfig(CudaReductionKernel< 2, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index d4f45f30e..6eb07f999 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -10,7 +10,7 @@ // Implemented by: Tomas Oberhuber, Jakub Klinkovsky -#pragma once +#pragma once #include #include @@ -18,7 +18,7 @@ namespace TNL { namespace Containers { -namespace Algorithms { +namespace Algorithms { template< typename Device > class Reduction @@ -29,36 +29,45 @@ template<> class Reduction< Devices::Cuda > { public: - template< typename Operation, typename Index > - static typename Operation::ResultType - reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* deviceInput1, - const typename Operation::DataType2* deviceInput2 ); + template< typename Index, + typename ReductionOperation, + typename DataFetcher, + typename Result = decltype( DataFetcher::operator() ) > + static Result + reduce( const Index size, + ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); }; template<> class Reduction< Devices::Host > { public: - template< typename Operation, typename Index > - static typename Operation::ResultType - reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* deviceInput1, - const typename Operation::DataType2* deviceInput2 ); + template< typename Index, + typename ReductionOperation, + typename DataFetcher, + typename Result = 
decltype( DataFetcher::operator() ) > + static Result + reduce( const Index size, + ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); }; template<> class Reduction< Devices::MIC > { public: - template< typename Operation, typename Index > - static typename Operation::ResultType - reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* deviceInput1, - const typename Operation::DataType2* deviceInput2 ); + template< typename Index, + typename ReductionOperation, + typename DataFetcher, + typename Result = decltype( DataFetcher::operator() ) > + static Result + reduce( const Index size, + ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); }; } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction_impl.h index b6a856249..375e6d004 100644 --- a/src/TNL/Containers/Algorithms/Reduction_impl.h +++ b/src/TNL/Containers/Algorithms/Reduction_impl.h @@ -38,35 +38,37 @@ namespace Algorithms { */ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; -template< typename Operation, typename Index > -typename Operation::ResultType + +template< typename Index, + typename ReductionOperation, + typename DataFetcher, + typename Result > +Result Reduction< Devices::Cuda >:: -reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* deviceInput1, - const typename Operation::DataType2* deviceInput2 ) + reduce( const Index size, + ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ) { #ifdef HAVE_CUDA - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; - typedef typename Operation::LaterReductionOperation LaterReductionOperation; + using IndexType = Index; + using ResultType = Result; /*** * Only fundamental 
and pointer types can be safely reduced on host. Complex * objects stored on the device might contain pointers into the device memory, * in which case reduction on host might fail. */ - constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; + //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; /*** * First check if the input array(s) is/are large enough for the reduction on GPU. * Otherwise copy it/them to host and reduce on CPU. */ - if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize ) + // With lambda function we cannot reduce data on host - we do not know the data here. 
+ /*if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize ) { typename std::remove_const< DataType1 >::type hostArray1[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, size ); @@ -74,12 +76,12 @@ reduce( Operation& operation, using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ); - return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2 ); + return Reduction< Devices::Host >::reduce( zero, dataFetcher, reduction, size, hostArray1, hostArray2 ); } else { return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr ); } - } + }*/ #ifdef CUDA_REDUCTION_PROFILING Timer timer; @@ -91,10 +93,10 @@ reduce( Operation& operation, * Reduce the data on the CUDA device. */ ResultType* deviceAux1( 0 ); - IndexType reducedSize = CudaReductionKernelLauncher( operation, - size, - deviceInput1, - deviceInput2, + IndexType reducedSize = CudaReductionKernelLauncher( size, + reduction, + dataFetcher, + zero, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -120,8 +122,8 @@ reduce( Operation& operation, /*** * Reduce the data on the host system. */ - LaterReductionOperation laterReductionOperation; - const ResultType result = Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr ); + auto copyFetch = [=] ( IndexType i ) { return resultArray[ i ]; }; + const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, copyFetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -134,12 +136,13 @@ reduce( Operation& operation, /*** * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. 
*/ - LaterReductionOperation laterReductionOperation; + //LaterReductionOperation laterReductionOperation; + auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; }; while( reducedSize > 1 ) { - reducedSize = CudaReductionKernelLauncher( laterReductionOperation, - reducedSize, - deviceAux1, - (ResultType*) 0, + reducedSize = CudaReductionKernelLauncher( reducedSize, + reduction, + copyFetch, + zero, deviceAux1 ); } @@ -166,18 +169,19 @@ reduce( Operation& operation, #endif }; -template< typename Operation, typename Index > -typename Operation::ResultType +template< typename Index, + typename ReductionOperation, + typename DataFetcher, + typename Result > +Result Reduction< Devices::Host >:: -reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* input1, - const typename Operation::DataType2* input2 ) + reduce( const Index size, + ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ) { - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; + using IndexType = Index; + using ResultType = Result; constexpr int block_size = 128; const int blocks = size / block_size; @@ -185,23 +189,24 @@ reduce( Operation& operation, #ifdef HAVE_OPENMP if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { // global result variable - ResultType result = operation.initialValue(); + ResultType result = zero; #pragma omp parallel { // initialize array for thread-local results - ResultType r[ 4 ] = { operation.initialValue(), - operation.initialValue(), - operation.initialValue(), - operation.initialValue() }; + ResultType r[ 4 ] = { zero, zero, zero, zero }; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); 
- operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); - operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); - operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); + operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); + operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); + operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ } } @@ -209,18 +214,19 @@ reduce( Operation& operation, #pragma omp single nowait { for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( r[ 0 ], i, input1, input2 ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); + //operation.dataFetcher( r[ 0 ], i, input1, input2 ); } // local reduction of unrolled results - operation.commonReduction( r[ 0 ], r[ 2 ] ); - operation.commonReduction( r[ 1 ], r[ 3 ] ); - operation.commonReduction( r[ 0 ], r[ 1 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { - operation.commonReduction( result, r[ 0 ] ); + result = reduction( result, r[ 0 ] ); } } return result; @@ -229,37 +235,43 @@ reduce( Operation& operation, #endif if( blocks > 1 ) { // initialize array for unrolled results - ResultType r[ 4 ] = { operation.initialValue(), - operation.initialValue(), - operation.initialValue(), - operation.initialValue() }; + ResultType r[ 4 ] = { zero, zero, zero, zero }; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - 
operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); - operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); - operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); - operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); + operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); + operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); + operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ } } // reduction of the last, incomplete block (not unrolled) for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( r[ 0 ], i, input1, input2 ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); + //operation.dataFetcher( r[ 0 ], i, input1, input2 ); // reduction of unrolled results - operation.commonReduction( r[ 0 ], r[ 2 ] ); - operation.commonReduction( r[ 1 ], r[ 3 ] ); - operation.commonReduction( r[ 0 ], r[ 1 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); + + /*operation.reduction( r[ 0 ], r[ 2 ] ); + operation.reduction( r[ 1 ], r[ 3 ] ); + operation.reduction( r[ 0 ], r[ 1 ] );*/ return r[ 0 ]; } else { - ResultType result = operation.initialValue(); + ResultType result = zero; for( IndexType i = 0; i < size; i++ ) - operation.firstReduction( result, i, input1, input2 ); + result = reduction( result, dataFetcher( i ) ); return result; } #ifdef HAVE_OPENMP diff --git a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h index 67400da64..d324806ab 100644 --- 
a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +++ b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h @@ -46,15 +46,21 @@ ResultType VectorOperations< Devices::Host >:: getVectorMax( const Vector& v ) { - typedef typename Vector::RealType RealType; + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - Algorithms::ParallelReductionMax< RealType, ResultType > operation; + /*Algorithms::ParallelReductionMax< RealType, ResultType > operation; return Reduction< Devices::Host >::reduce( operation, v.getSize(), v.getData(), - ( RealType* ) 0 ); + ( RealType* ) 0 );*/ + const RealType* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( const RealType& a, const RealType& b ) { return TNL::max( a, b ); }; + return Reduction< Devices::Host >::reduce( v.getSize(), reduction, fetch, 0.0 ); + } template< typename Vector, typename ResultType > @@ -78,15 +84,21 @@ ResultType VectorOperations< Devices::Host >:: getVectorAbsMax( const Vector& v ) { - typedef typename Vector::RealType RealType; + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); - Algorithms::ParallelReductionAbsMax< RealType, ResultType > operation; + /*Algorithms::ParallelReductionAbsMax< RealType, ResultType > operation; return Reduction< Devices::Host >::reduce( operation, v.getSize(), v.getData(), - ( RealType* ) 0 ); + ( RealType* ) 0 );*/ + + const RealType* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto max = [=] __cuda_callable__ ( const RealType& a, const RealType& b ) { return TNL::max( a, b ); }; + return Reduction< Devices::Host >::reduce( v.getSize(), max, fetch, 0.0 ); } diff --git a/src/TNL/Math.h b/src/TNL/Math.h index 68eb1f556..dcfa91a34 100644 --- a/src/TNL/Math.h +++ b/src/TNL/Math.h @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ -#pragma once +#pragma once #include #include @@ -18,6 +18,13 @@ namespace TNL { +template< typename T1, typename T2, typename ResultType = typename std::common_type< T1, T2 >::type > +__cuda_callable__ inline +ResultType sum( const T1& a, const T2& b ) +{ + return a + b; +} + /** * \brief This function returns minimum of two numbers. * @@ -105,7 +112,7 @@ template< typename T1, typename T2, typename ResultType = typename std::common_t __cuda_callable__ ResultType argMax( const T1& a, const T2& b ) { - return ( a > b ) ? a : b; + return ( a > b ) ? a : b; } /*** @@ -125,7 +132,7 @@ template< typename T1, typename T2, typename ResultType = typename std::common_t __cuda_callable__ ResultType argAbsMax( const T1& a, const T2& b ) { - return ( TNL::abs( a ) > TNL::abs( b ) ) ? a : b; + return ( TNL::abs( a ) > TNL::abs( b ) ) ? a : b; } /** -- GitLab From 8c1c80e2428d96fcd1259586a227688f24a74d5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 13 Apr 2019 20:26:02 +0200 Subject: [PATCH 3/9] [WIP] New lambda function based reduction compiles but does not work in CUDA. 
--- .../Containers/Algorithms/ArrayAssignment.h | 2 - .../Algorithms/ArrayOperationsCuda.hpp | 34 +- .../Algorithms/CommonVectorOperations.h | 79 +++++ .../Algorithms/CudaReductionKernel.h | 62 ++-- src/TNL/Containers/Algorithms/Reduction.h | 12 +- .../Containers/Algorithms/Reduction_impl.h | 19 +- .../Containers/Algorithms/VectorOperations.h | 19 +- .../Algorithms/VectorOperationsCuda_impl.h | 8 +- .../Algorithms/VectorOperationsHost_impl.h | 293 +----------------- .../Containers/ArrayOperationsTest.h | 6 +- 10 files changed, 172 insertions(+), 362 deletions(-) create mode 100644 src/TNL/Containers/Algorithms/CommonVectorOperations.h diff --git a/src/TNL/Containers/Algorithms/ArrayAssignment.h b/src/TNL/Containers/Algorithms/ArrayAssignment.h index 62ab43a84..7f5b8482e 100644 --- a/src/TNL/Containers/Algorithms/ArrayAssignment.h +++ b/src/TNL/Containers/Algorithms/ArrayAssignment.h @@ -86,8 +86,6 @@ struct ArrayAssignment< Array, T, false > }; - - } // namespace Algorithms } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index ab8770013..0570f10f4 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -205,8 +205,19 @@ compareMemory( const Element1* destination, { TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." 
); - Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; - return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source ); + + Element1* d; + cudaMalloc( ( void** ) &d, size * sizeof( Element1 ) ); + auto fetch = [=] __cuda_callable__ ( Index i ) { return d[ 0 ]; }; //( destination[ i ] == source[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; + return Reduction< Devices::Cuda >::reduce( + size, + reduction, //[=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }, + fetch, //[=] __cuda_callable__ ( Index i ) { return destination[ i ]; }, + true ); + + /*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; + return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/ } template< typename Element, @@ -219,11 +230,11 @@ containsValue( const Element* data, { TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); + if( size == 0 ) return false; - //Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; - //reductionContainsValue.setValue( value ); - auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; - return Reduction< Devices::Cuda >::reduce( size, TNL::sum, fetch, 0.0 ); + auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; + auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a || b; }; + return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); } template< typename Element, @@ -237,9 +248,16 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." 
); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; - Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; + + if( size == 0 ) return false; + auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; + auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; + return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); + + + /*Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; reductionContainsOnlyValue.setValue( value ); - return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr ); + return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );*/ } diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.h b/src/TNL/Containers/Algorithms/CommonVectorOperations.h new file mode 100644 index 000000000..bd362fbab --- /dev/null +++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.h @@ -0,0 +1,79 @@ +/*************************************************************************** + CommonVectorOperations.h - description + ------------------- + begin : Apr 12, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Device > +struct CommonVectorOperations +{ + using DeviceType = Device; + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorMax( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorMin( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorAbsMax( const Vector& v ); + + 
template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorAbsMin( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorL1Norm( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorL2Norm( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType, typename Scalar > + static ResultType getVectorLpNorm( const Vector& v, const Scalar p ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorSum( const Vector& v ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceMax( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceMin( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceAbsMax( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceAbsMin( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceL1Norm( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceL2Norm( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType, typename Scalar > + static ResultType getVectorDifferenceLpNorm( 
const Vector1& v1, const Vector2& v2, const Scalar p ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 ); + +}; + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL + +#include diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 0a71aaa79..685a3a332 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -40,23 +40,21 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi #endif template< int blockSize, - typename Real, + typename Result, typename DataFetcher, typename Reduction, - typename Index, - typename Result = decltype( std::declval< DataFetcher >( 0,0 ) ) > + typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) -CudaReductionKernel( const Real& zero, +CudaReductionKernel( const Result zero, const DataFetcher& dataFetcher, const Reduction& reduction, const Index size, Result* output ) { - using RealType = Real; using IndexType = Index; using ResultType = Result; - + ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); /*** @@ -69,11 +67,12 @@ CudaReductionKernel( const Real& zero, const IndexType gridSize = blockDim.x * gridDim.x; sdata[ tid ] = zero; + /*** * Read data into the shared memory. We start with the * sequential reduction. 
*/ - while( gid + 4 * gridSize < size ) + /*while( gid + 4 * gridSize < size ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); @@ -86,15 +85,14 @@ CudaReductionKernel( const Real& zero, sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; - } + }*/ while( gid < size ) { - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = dataFetcher( gid ); //reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); - - + return; //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); //return; @@ -104,19 +102,19 @@ CudaReductionKernel( const Real& zero, if( blockSize >= 1024 ) { if( tid < 512 ) - reduction( sdata[ tid ], sdata[ tid + 512 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) - reduction( sdata[ tid ], sdata[ tid + 256 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - reduction( sdata[ tid ], sdata[ tid + 128 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -124,7 +122,7 @@ CudaReductionKernel( const Real& zero, if( blockSize >= 128 ) { if( tid < 64 ) - reduction( sdata[ tid ], sdata[ tid + 64 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -138,34 +136,34 @@ CudaReductionKernel( const Real& zero, volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { - reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work 
// We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { - reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { - reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { - reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { - reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { - reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); + vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } @@ -181,8 +179,7 @@ CudaReductionKernel( const Real& zero, } -template< typename Real, - typename DataFetcher, +template< typename DataFetcher, typename Reduction, typename Index, typename Result > @@ -190,10 +187,9 @@ int CudaReductionKernelLauncher( const Index size, const Reduction& reduction, const DataFetcher& dataFetcher, - const Real& zero, + const Result& zero, Result*& output ) { - using RealType = Real; using IndexType = Index; using ResultType = Result; @@ -236,49 +232,49 @@ CudaReductionKernelLauncher( const Index size, <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, 
output); break; case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + 
cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index 6eb07f999..ece9016c5 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -30,9 +30,9 @@ class Reduction< Devices::Cuda > { public: template< typename Index, + typename Result, typename ReductionOperation, - typename DataFetcher, - typename Result = decltype( DataFetcher::operator() ) > + typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, @@ -45,9 +45,9 @@ class Reduction< Devices::Host > { public: template< typename Index, + typename Result, typename ReductionOperation, - typename DataFetcher, - typename Result = decltype( DataFetcher::operator() ) > + typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, @@ -60,9 +60,9 @@ class Reduction< Devices::MIC > { public: template< typename Index, + typename Result, typename ReductionOperation, - typename DataFetcher, - typename Result = decltype( DataFetcher::operator() ) > + typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction_impl.h index 375e6d004..b5fcabdc7 100644 --- a/src/TNL/Containers/Algorithms/Reduction_impl.h +++ 
b/src/TNL/Containers/Algorithms/Reduction_impl.h @@ -40,9 +40,9 @@ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//25 template< typename Index, + typename Result, typename ReductionOperation, - typename DataFetcher, - typename Result > + typename DataFetcher > Result Reduction< Devices::Cuda >:: reduce( const Index size, @@ -109,7 +109,9 @@ Reduction< Devices::Cuda >:: /*** * Transfer the reduced data from device to host. */ - ResultType resultArray[ reducedSize ]; + //ResultType* resultArray[ reducedSize ]; + //std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; + ResultType* resultArray = new ResultType[ reducedSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING @@ -122,14 +124,15 @@ Reduction< Devices::Cuda >:: /*** * Reduce the data on the host system. */ - auto copyFetch = [=] ( IndexType i ) { return resultArray[ i ]; }; - const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, copyFetch, zero ); + auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; + const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. 
" << std::endl; #endif - + + delete[] resultArray; return result; } else { @@ -170,9 +173,9 @@ Reduction< Devices::Cuda >:: }; template< typename Index, + typename Result, typename ReductionOperation, - typename DataFetcher, - typename Result > + typename DataFetcher > Result Reduction< Devices::Host >:: reduce( const Index size, diff --git a/src/TNL/Containers/Algorithms/VectorOperations.h b/src/TNL/Containers/Algorithms/VectorOperations.h index a3ef89fc3..1c89dbca0 100644 --- a/src/TNL/Containers/Algorithms/VectorOperations.h +++ b/src/TNL/Containers/Algorithms/VectorOperations.h @@ -11,6 +11,7 @@ #pragma once #include +#include #include #include @@ -22,7 +23,7 @@ template< typename Device > class VectorOperations{}; template<> -class VectorOperations< Devices::Host > +class VectorOperations< Devices::Host > : public CommonVectorOperations< Devices::Host > { public: template< typename Vector > @@ -36,7 +37,7 @@ public: const typename Vector::RealType& value, const Scalar thisElementMultiplicator ); - template< typename Vector, typename ResultType = typename Vector::RealType > + /*template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorMax( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > @@ -83,12 +84,13 @@ public: template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 ); + */ template< typename Vector, typename Scalar > static void vectorScalarMultiplication( Vector& v, Scalar alpha ); - template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > - static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 ); + /*template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );*/ template< 
typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 > static void addVector( Vector1& y, @@ -116,7 +118,7 @@ public: }; template<> -class VectorOperations< Devices::Cuda > +class VectorOperations< Devices::Cuda > : public CommonVectorOperations< Devices::Cuda > { public: template< typename Vector > @@ -130,7 +132,7 @@ public: const typename Vector::RealType& value, const Scalar thisElementMultiplicator ); - template< typename Vector, typename ResultType = typename Vector::RealType > + /*template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorMax( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > @@ -177,12 +179,13 @@ public: template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 ); + */ template< typename Vector, typename Scalar > static void vectorScalarMultiplication( Vector& v, const Scalar alpha ); - template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > - static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 ); + /*template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );*/ template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 > static void addVector( Vector1& y, diff --git a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h index 05f334b05..d79b5f069 100644 --- a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h +++ b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h @@ -39,7 +39,7 @@ addElement( Vector& v, v[ i ] = thisElementMultiplicator * v[ i ] + value; } -template< typename Vector, typename ResultType > +/*template< typename Vector, 
typename ResultType > ResultType VectorOperations< Devices::Cuda >:: getVectorMax( const Vector& v ) @@ -314,7 +314,7 @@ getVectorDifferenceSum( const Vector1& v1, v1.getSize(), v1.getData(), v2.getData() ); -} +}*/ #ifdef HAVE_CUDA template< typename Real, typename Index, typename Scalar > @@ -358,7 +358,7 @@ vectorScalarMultiplication( Vector& v, } -template< typename Vector1, typename Vector2, typename ResultType > +/*template< typename Vector1, typename Vector2, typename ResultType > ResultType VectorOperations< Devices::Cuda >:: getScalarProduct( const Vector1& v1, @@ -372,7 +372,7 @@ getScalarProduct( const Vector1& v1, v1.getSize(), v1.getData(), v2.getData() ); -} +}*/ #ifdef HAVE_CUDA template< typename Real1, typename Real2, typename Index, typename Scalar1, typename Scalar2 > diff --git a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h index d324806ab..7b1bba80c 100644 --- a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +++ b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h @@ -41,295 +41,6 @@ addElement( Vector& v, v[ i ] = thisElementMultiplicator * v[ i ] + value; } -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorMax( const Vector& v ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); - - /*Algorithms::ParallelReductionMax< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 );*/ - const RealType* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( const RealType& a, const RealType& b ) { return TNL::max( a, b ); }; - return Reduction< Devices::Host >::reduce( v.getSize(), reduction, fetch, 0.0 ); - -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorMin( const Vector& v ) -{ - typedef typename Vector::RealType RealType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionMin< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorAbsMax( const Vector& v ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - /*Algorithms::ParallelReductionAbsMax< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 );*/ - - const RealType* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto max = [=] __cuda_callable__ ( const RealType& a, const RealType& b ) { return TNL::max( a, b ); }; - return Reduction< Devices::Host >::reduce( v.getSize(), max, fetch, 0.0 ); -} - - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorAbsMin( const Vector& v ) -{ - typedef typename Vector::RealType RealType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); - - Algorithms::ParallelReductionAbsMin< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorL1Norm( const Vector& v ) -{ - typedef typename Vector::RealType RealType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionAbsSum< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorL2Norm( const Vector& v ) -{ - typedef typename Vector::RealType Real; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionL2Norm< Real, ResultType > operation; - const ResultType result = Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( Real* ) 0 ); - return std::sqrt( result ); -} - -template< typename Vector, typename ResultType, typename Scalar > -ResultType -VectorOperations< Devices::Host >:: -getVectorLpNorm( const Vector& v, - const Scalar p ) -{ - typedef typename Vector::RealType Real; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." 
); - - if( p == 1.0 ) - return getVectorL1Norm< Vector, ResultType >( v ); - if( p == 2.0 ) - return getVectorL2Norm< Vector, ResultType >( v ); - - Algorithms::ParallelReductionLpNorm< Real, ResultType, Scalar > operation; - operation.setPower( p ); - const ResultType result = Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( Real* ) 0 ); - return std::pow( result, 1.0 / p ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorSum( const Vector& v ) -{ - typedef typename Vector::RealType Real; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionSum< Real, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( Real* ) 0 ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceMax( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffMax< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceMin( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); - - Algorithms::ParallelReductionDiffMin< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceAbsMax( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffAbsMax< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceAbsMin( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffAbsMin< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceL1Norm( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); - - Algorithms::ParallelReductionDiffAbsSum< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceL2Norm( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffL2Norm< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - const ResultType result = Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); - return std::sqrt( result ); -} - - -template< typename Vector1, typename Vector2, typename ResultType, typename Scalar > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceLpNorm( const Vector1& v1, - const Vector2& v2, - const Scalar p ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." 
); - - if( p == 1.0 ) - return getVectorDifferenceL1Norm< Vector1, Vector2, ResultType >( v1, v2 ); - if( p == 2.0 ) - return getVectorDifferenceL2Norm< Vector1, Vector2, ResultType >( v1, v2 ); - - Algorithms::ParallelReductionDiffLpNorm< typename Vector1::RealType, typename Vector2::RealType, ResultType, Scalar > operation; - operation.setPower( p ); - const ResultType result = Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); - return std::pow( result, 1.0 / p ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceSum( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffSum< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} template< typename Vector, typename Scalar > @@ -351,7 +62,7 @@ vectorScalarMultiplication( Vector& v, } -template< typename Vector1, typename Vector2, typename ResultType > +/*template< typename Vector1, typename Vector2, typename ResultType > ResultType VectorOperations< Devices::Host >:: getScalarProduct( const Vector1& v1, @@ -365,7 +76,7 @@ getScalarProduct( const Vector1& v1, v1.getSize(), v1.getData(), v2.getData() ); -} +}*/ template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 > void diff --git a/src/UnitTests/Containers/ArrayOperationsTest.h b/src/UnitTests/Containers/ArrayOperationsTest.h index 23b8fcd4e..9a0d24039 100644 --- a/src/UnitTests/Containers/ArrayOperationsTest.h +++ b/src/UnitTests/Containers/ArrayOperationsTest.h @@ -303,9 +303,10 @@ TYPED_TEST( ArrayOperationsTest, copyMemoryWithConversions_cuda ) ArrayOperations< Devices::Cuda 
>::freeMemory( deviceData2 ); } -TYPED_TEST( ArrayOperationsTest, compareMemory_cuda ) +//TYPED_TEST( ArrayOperationsTest, compareMemory_cuda ) +void Test() { - using ValueType = typename TestFixture::ValueType; + using ValueType = double;//typename TestFixture::ValueType; const int size = ARRAY_TEST_SIZE; ValueType *hostData, *deviceData, *deviceData2; @@ -445,6 +446,7 @@ TYPED_TEST( ArrayOperationsTest, containsOnlyValue_cuda ) #include "../GtestMissingError.h" int main( int argc, char* argv[] ) { + Test(); #ifdef HAVE_GTEST ::testing::InitGoogleTest( &argc, argv ); return RUN_ALL_TESTS(); -- GitLab From ec25e9a1f58b3ff1c4feed3ccdd15d0a5b489b2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 13 Apr 2019 22:01:49 +0200 Subject: [PATCH 4/9] [WIP] Deleted debugging code. --- .../Algorithms/ArrayOperationsCuda.hpp | 5 +---- .../Algorithms/CudaReductionKernel.h | 8 +++---- .../Containers/Algorithms/Reduction_impl.h | 8 +++---- .../Containers/ArrayOperationsTest.h | 22 +++++++++++++++---- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 0570f10f4..472eb414a 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -193,7 +193,6 @@ copySTLList( DestinationElement* destination, copiedElements += copySize; } } - template< typename Element1, typename Element2, typename Index > @@ -206,9 +205,7 @@ compareMemory( const Element1* destination, TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." 
); - Element1* d; - cudaMalloc( ( void** ) &d, size * sizeof( Element1 ) ); - auto fetch = [=] __cuda_callable__ ( Index i ) { return d[ 0 ]; }; //( destination[ i ] == source[ i ] ); }; + auto fetch = [=] __cuda_callable__ ( Index i ) { return ( destination[ i ] == source[ i ] ); }; auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; return Reduction< Devices::Cuda >::reduce( size, diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 685a3a332..21331defe 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -72,7 +72,7 @@ CudaReductionKernel( const Result zero, * Read data into the shared memory. We start with the * sequential reduction. */ - /*while( gid + 4 * gridSize < size ) + while( gid + 4 * gridSize < size ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); @@ -85,14 +85,14 @@ CudaReductionKernel( const Result zero, sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; - }*/ + } while( gid < size ) { - sdata[ tid ] = dataFetcher( gid ); //reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); - return; + //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); //return; diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction_impl.h index b5fcabdc7..0fee746cf 100644 --- a/src/TNL/Containers/Algorithms/Reduction_impl.h +++ b/src/TNL/Containers/Algorithms/Reduction_impl.h @@ -110,9 +110,9 @@ Reduction< Devices::Cuda >:: * Transfer the reduced data from device to host. 
*/ //ResultType* resultArray[ reducedSize ]; - //std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; - ResultType* resultArray = new ResultType[ reducedSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); + std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; + //ResultType* resultArray = new ResultType[ reducedSize ]; + ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -132,7 +132,7 @@ Reduction< Devices::Cuda >:: std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif - delete[] resultArray; + //delete[] resultArray; return result; } else { diff --git a/src/UnitTests/Containers/ArrayOperationsTest.h b/src/UnitTests/Containers/ArrayOperationsTest.h index 9a0d24039..21bfccdea 100644 --- a/src/UnitTests/Containers/ArrayOperationsTest.h +++ b/src/UnitTests/Containers/ArrayOperationsTest.h @@ -303,10 +303,9 @@ TYPED_TEST( ArrayOperationsTest, copyMemoryWithConversions_cuda ) ArrayOperations< Devices::Cuda >::freeMemory( deviceData2 ); } -//TYPED_TEST( ArrayOperationsTest, compareMemory_cuda ) -void Test() +TYPED_TEST( ArrayOperationsTest, compareMemory_cuda ) { - using ValueType = double;//typename TestFixture::ValueType; + using ValueType = typename TestFixture::ValueType; const int size = ARRAY_TEST_SIZE; ValueType *hostData, *deviceData, *deviceData2; @@ -446,7 +445,22 @@ TYPED_TEST( ArrayOperationsTest, containsOnlyValue_cuda ) #include "../GtestMissingError.h" int main( int argc, char* argv[] ) { - Test(); + + using ValueType = double; + int size = 1000; + ValueType *hostData, *deviceData, *deviceData2; + ArrayOperations< Devices::Host >::allocateMemory( hostData, size ); + ArrayOperations< Devices::Cuda >::allocateMemory( deviceData, size ); + ArrayOperations< Devices::Cuda >::allocateMemory( 
deviceData2, size ); + + ArrayOperations< Devices::Host >::setMemory( hostData, (ValueType) 7, size ); + ArrayOperations< Devices::Cuda >::setMemory( deviceData, (ValueType) 8, size ); + ArrayOperations< Devices::Cuda >::setMemory( deviceData2, (ValueType) 9, size ); + EXPECT_FALSE(( ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< ValueType, ValueType >( hostData, deviceData, size ) )); + EXPECT_FALSE(( ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< ValueType, ValueType >( deviceData, hostData, size ) )); + EXPECT_FALSE(( ArrayOperations< Devices::Cuda >::compareMemory< ValueType, ValueType >( deviceData, deviceData2, size ) )); + + return 0; #ifdef HAVE_GTEST ::testing::InitGoogleTest( &argc, argv ); return RUN_ALL_TESTS(); -- GitLab From 0ef6688575b2474cda5bec9349810a34390572fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 14 Apr 2019 10:36:43 +0200 Subject: [PATCH 5/9] Parallel reduction is working. --- .../Algorithms/ArrayOperationsCuda.hpp | 6 +----- .../Containers/Algorithms/CudaReductionKernel.h | 7 ++----- src/UnitTests/Containers/ArrayOperationsTest.h | 16 ---------------- 3 files changed, 3 insertions(+), 26 deletions(-) diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 472eb414a..a12b9c67f 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -207,11 +207,7 @@ compareMemory( const Element1* destination, auto fetch = [=] __cuda_callable__ ( Index i ) { return ( destination[ i ] == source[ i ] ); }; auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; - return Reduction< Devices::Cuda >::reduce( - size, - reduction, //[=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }, - fetch, //[=] __cuda_callable__ ( Index i ) { return destination[ i ]; }, - true ); + return Reduction< 
Devices::Cuda >::reduce( size, reduction, fetch, true ); /*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/ diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 21331defe..8fea90f8f 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -47,8 +47,8 @@ template< int blockSize, __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Result zero, - const DataFetcher& dataFetcher, - const Reduction& reduction, + const DataFetcher dataFetcher, + const Reduction reduction, const Index size, Result* output ) { @@ -94,8 +94,6 @@ CudaReductionKernel( const Result zero, __syncthreads(); //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); - - //return; /*** * Perform the parallel reduction. */ @@ -127,7 +125,6 @@ CudaReductionKernel( const Result zero, //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } - /*** * This runs in one warp so it is synchronized implicitly. 
*/ diff --git a/src/UnitTests/Containers/ArrayOperationsTest.h b/src/UnitTests/Containers/ArrayOperationsTest.h index 21bfccdea..23b8fcd4e 100644 --- a/src/UnitTests/Containers/ArrayOperationsTest.h +++ b/src/UnitTests/Containers/ArrayOperationsTest.h @@ -445,22 +445,6 @@ TYPED_TEST( ArrayOperationsTest, containsOnlyValue_cuda ) #include "../GtestMissingError.h" int main( int argc, char* argv[] ) { - - using ValueType = double; - int size = 1000; - ValueType *hostData, *deviceData, *deviceData2; - ArrayOperations< Devices::Host >::allocateMemory( hostData, size ); - ArrayOperations< Devices::Cuda >::allocateMemory( deviceData, size ); - ArrayOperations< Devices::Cuda >::allocateMemory( deviceData2, size ); - - ArrayOperations< Devices::Host >::setMemory( hostData, (ValueType) 7, size ); - ArrayOperations< Devices::Cuda >::setMemory( deviceData, (ValueType) 8, size ); - ArrayOperations< Devices::Cuda >::setMemory( deviceData2, (ValueType) 9, size ); - EXPECT_FALSE(( ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< ValueType, ValueType >( hostData, deviceData, size ) )); - EXPECT_FALSE(( ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< ValueType, ValueType >( deviceData, hostData, size ) )); - EXPECT_FALSE(( ArrayOperations< Devices::Cuda >::compareMemory< ValueType, ValueType >( deviceData, deviceData2, size ) )); - - return 0; #ifdef HAVE_GTEST ::testing::InitGoogleTest( &argc, argv ); return RUN_ALL_TESTS(); -- GitLab From 4b52cc2bcfdf4ae836f6b831fb1ebb3ddbc44e38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 14 Apr 2019 22:43:35 +0200 Subject: [PATCH 6/9] Rewritting lambdas with references. 
--- .../Algorithms/ArrayOperationsCuda.hpp | 29 +- .../Algorithms/CommonVectorOperations.hpp | 375 ++++++++++++++++++ .../Algorithms/CudaReductionKernel.h | 72 ++-- src/TNL/Containers/Algorithms/Reduction.h | 8 +- .../{Reduction_impl.h => Reduction.hpp} | 78 ++-- 5 files changed, 465 insertions(+), 97 deletions(-) create mode 100644 src/TNL/Containers/Algorithms/CommonVectorOperations.hpp rename src/TNL/Containers/Algorithms/{Reduction_impl.h => Reduction.hpp} (74%) diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index a12b9c67f..bede985bc 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -205,12 +205,10 @@ compareMemory( const Element1* destination, TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); - auto fetch = [=] __cuda_callable__ ( Index i ) { return ( destination[ i ] == source[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); - - /*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; - return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/ + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; + //return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); } template< typename Element, @@ -225,9 +223,10 @@ containsValue( const Element* data, TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; - auto fetch = [=] __cuda_callable__ ( Index 
i ) { return ( data[ i ] == value ); }; - auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a || b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; + auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; }; + //return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); } template< typename Element, @@ -243,14 +242,10 @@ containsOnlyValue( const Element* data, if( size == 0 ) return false; if( size == 0 ) return false; - auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; - auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); - - - /*Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; - reductionContainsOnlyValue.setValue( value ); - return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );*/ + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; + auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; + //return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); } diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp new file mode 100644 index 000000000..1dc43b2c7 --- /dev/null +++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp @@ -0,0 +1,375 @@ +/*************************************************************************** + CommonVectorOperations.hpp - description + 
------------------- + begin : Apr 12, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorMax( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorMin( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorAbsMax( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorAbsMin( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorL1Norm( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorL2Norm( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::sqrt( Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); +} + +template< typename Device > + template< typename Vector, typename ResultType, typename Scalar > +ResultType +CommonVectorOperations< Device >:: +getVectorLpNorm( const Vector& v, + const Scalar p ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + if( p == 1.0 ) + return getVectorL1Norm< Vector, ResultType >( v ); + if( p == 2.0 ) + return getVectorL2Norm< Vector, ResultType >( v ); + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::pow( Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorSum( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); + + if( std::is_same< ResultType, bool >::value ) + abort(); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceMax( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceMin( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceAbsMax( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceAbsMin( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." 
); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceL1Norm( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceL2Norm( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." 
); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { + auto diff = data1[ i ] - data2[ i ]; + return diff * diff; + }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::sqrt( Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType, typename Scalar > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceLpNorm( const Vector1& v1, + const Vector2& v2, + const Scalar p ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." 
); + + if( p == 1.0 ) + return getVectorDifferenceL1Norm< Vector1, Vector2, ResultType >( v1, v2 ); + if( p == 2.0 ) + return getVectorDifferenceL2Norm< Vector1, Vector2, ResultType >( v1, v2 ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::pow( Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceSum( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getScalarProduct( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 8fea90f8f..a90e0a3aa 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -43,12 +43,14 @@ template< int blockSize, typename Result, typename DataFetcher, typename Reduction, + typename 
VolatileReduction, typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Result zero, const DataFetcher dataFetcher, const Reduction reduction, + const VolatileReduction volatileReduction, const Index size, Result* output ) { @@ -74,21 +76,21 @@ CudaReductionKernel( const Result zero, */ while( gid + 4 * gridSize < size ) { - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid ) ); + reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid ) ); + reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < size ) { - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); @@ -100,19 +102,19 @@ CudaReductionKernel( const Result zero, if( blockSize >= 1024 ) { if( tid < 512 ) - sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); + reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) - sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); + reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); + reduction( sdata[ tid ], 
sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -120,7 +122,7 @@ CudaReductionKernel( const Result zero, if( blockSize >= 128 ) { if( tid < 64 ) - sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); + reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -133,34 +135,34 @@ CudaReductionKernel( const Result zero, volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work // We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } @@ -178,11 +180,13 @@ CudaReductionKernel( const Result zero, template< typename DataFetcher, typename Reduction, + typename VolatileReduction, typename Index, typename Result > int 
CudaReductionKernelLauncher( const Index size, const Reduction& reduction, + const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result*& output ) @@ -226,55 +230,55 @@ CudaReductionKernelLauncher( const Index size, { case 512: CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 32: - 
cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, 
Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index ece9016c5..7ac0d4c02 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -32,10 +32,12 @@ public: template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -47,10 +49,12 @@ public: template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -62,10 +66,12 @@ public: template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -74,4 +80,4 @@ public: } // namespace Containers } // namespace TNL -#include "Reduction_impl.h" +#include "Reduction.hpp" diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction.hpp similarity index 74% rename from 
src/TNL/Containers/Algorithms/Reduction_impl.h rename to src/TNL/Containers/Algorithms/Reduction.hpp index 0fee746cf..22121de25 100644 --- a/src/TNL/Containers/Algorithms/Reduction_impl.h +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -42,11 +42,13 @@ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//25 template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Cuda >:: reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -94,10 +96,11 @@ Reduction< Devices::Cuda >:: */ ResultType* deviceAux1( 0 ); IndexType reducedSize = CudaReductionKernelLauncher( size, - reduction, - dataFetcher, - zero, - deviceAux1 ); + reduction, + volatileReduction, + dataFetcher, + zero, + deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; @@ -111,7 +114,6 @@ Reduction< Devices::Cuda >:: */ //ResultType* resultArray[ reducedSize ]; std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; - //ResultType* resultArray = new ResultType[ reducedSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING @@ -125,28 +127,26 @@ Reduction< Devices::Cuda >:: * Reduce the data on the host system. */ auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; - const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, fetch, zero ); + const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. 
" << std::endl; #endif - - //delete[] resultArray; return result; } else { /*** * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. */ - //LaterReductionOperation laterReductionOperation; auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; }; while( reducedSize > 1 ) { reducedSize = CudaReductionKernelLauncher( reducedSize, - reduction, - copyFetch, - zero, - deviceAux1 ); + reduction, + volatileReduction, + copyFetch, + zero, + deviceAux1 ); } #ifdef CUDA_REDUCTION_PROFILING @@ -175,11 +175,13 @@ Reduction< Devices::Cuda >:: template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Host >:: reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -202,14 +204,10 @@ Reduction< Devices::Host >:: for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); - r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); - /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); - operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); - operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); - operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ + reduction( r[ 0 ], dataFetcher( offset + i ) ); + reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } @@ -217,19 +215,18 @@ Reduction< Devices::Host >:: #pragma omp single nowait { for( IndexType i = blocks * block_size; i < size; i++ ) - r[ 0 ] = reduction( r[ 0 ], 
dataFetcher( i ) ); - //operation.dataFetcher( r[ 0 ], i, input1, input2 ); + reduction( r[ 0 ], dataFetcher( i ) ); } // local reduction of unrolled results - r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); - r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); - r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); + reduction( r[ 0 ], r[ 2 ] ); + reduction( r[ 1 ], r[ 3 ] ); + reduction( r[ 0 ], r[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { - result = reduction( result, r[ 0 ] ); + reduction( result, r[ 0 ] ); } } return result; @@ -244,37 +241,28 @@ Reduction< Devices::Host >:: for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); - r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); - /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); - operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); - operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); - operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ + reduction( r[ 0 ], dataFetcher( offset + i ) ); + reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } // reduction of the last, incomplete block (not unrolled) for( IndexType i = blocks * block_size; i < size; i++ ) - r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); + reduction( r[ 0 ], dataFetcher( i ) ); //operation.dataFetcher( r[ 0 ], i, input1, input2 ); // reduction of unrolled results - r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); - r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); - r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); - - /*operation.reduction( r[ 0 ], r[ 2 ] ); - operation.reduction( r[ 1 ], r[ 3 ] ); - operation.reduction( r[ 0 ], r[ 1 ] );*/ - + reduction( r[ 0 ], r[ 2 
] ); + reduction( r[ 1 ], r[ 3 ] ); + reduction( r[ 0 ], r[ 1 ] ); return r[ 0 ]; } else { ResultType result = zero; for( IndexType i = 0; i < size; i++ ) - result = reduction( result, dataFetcher( i ) ); + reduction( result, dataFetcher( i ) ); return result; } #ifdef HAVE_OPENMP -- GitLab From aa18c0a2925e30fa37f7455351bce217a7faa944 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 14 Apr 2019 23:01:09 +0200 Subject: [PATCH 7/9] Changing numeric limits -max to lowest. --- src/TNL/Containers/Algorithms/CommonVectorOperations.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp index 1dc43b2c7..5430f6eac 100644 --- a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp +++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp @@ -32,7 +32,7 @@ getVectorMax( const Vector& v ) auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -68,7 +68,7 @@ getVectorAbsMax( const Vector& v ) auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Reduction< DeviceType 
>::reduce( v.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -189,7 +189,7 @@ getVectorDifferenceMax( const Vector1& v1, auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -231,7 +231,7 @@ getVectorDifferenceAbsMax( const Vector1& v1, auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > -- GitLab From 58af5093d15740f6bf3e3cb2c9715161654f64c8 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Mon, 15 Apr 2019 13:40:51 +0200 Subject: [PATCH 8/9] Fixing CUDA array operations. 
--- src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index bede985bc..0badc8d79 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -208,7 +208,7 @@ compareMemory( const Element1* destination, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - //return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); + return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); } template< typename Element, @@ -226,7 +226,7 @@ containsValue( const Element* data, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; }; - //return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); + return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); } template< typename Element, @@ -245,7 +245,7 @@ containsOnlyValue( const Element* data, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - //return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); + return Reduction< Devices::Cuda >::reduce( size, 
reduction, volatileReduction, fetch, true ); } -- GitLab From 3f891223ca0065800b1952252d2f179f3853310b Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Mon, 15 Apr 2019 16:04:24 +0200 Subject: [PATCH 9/9] Fixed use of buffer in CUDA parallel reduction. --- .../Algorithms/CommonVectorOperations.hpp | 4 +- .../Algorithms/CudaReductionKernel.h | 250 +++++++++++------- src/TNL/Containers/Algorithms/Reduction.hpp | 25 +- 3 files changed, 165 insertions(+), 114 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp index 5430f6eac..ef3317cb7 100644 --- a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp +++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp @@ -24,7 +24,7 @@ CommonVectorOperations< Device >:: getVectorMax( const Vector& v ) { TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - + using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; @@ -42,7 +42,7 @@ CommonVectorOperations< Device >:: getVectorMin( const Vector& v ) { TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); - + using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index a90e0a3aa..8701b8b7d 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -56,7 +56,7 @@ CudaReductionKernel( const Result zero, { using IndexType = Index; using ResultType = Result; - + ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); /*** @@ -178,22 +178,14 @@ CudaReductionKernel( const Result zero, } -template< typename DataFetcher, - typename Reduction, - typename VolatileReduction, - typename Index, - typename Result > -int -CudaReductionKernelLauncher( const Index size, - const Reduction& reduction, - const VolatileReduction& volatileReduction, - const DataFetcher& dataFetcher, - const Result& zero, - Result*& output ) +template< typename Index, + typename Result > +struct CudaReductionKernelLauncher { using IndexType = Index; using ResultType = Result; + //// // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per @@ -203,93 +195,159 @@ CudaReductionKernelLauncher( const Index size, // where blocksPerMultiprocessor is determined according to the number of // available registers on the multiprocessor. // On Tesla K40c, desGridSize = 8 * 15 = 120. 
- const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice(); - const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice ) - / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ); - const int desGridSize = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ); - dim3 blockSize, gridSize; - blockSize.x = Reduction_maxThreadsPerBlock; - gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); - - // create reference to the reduction buffer singleton and set size - const size_t buf_size = desGridSize * sizeof( ResultType ); - CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); - cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); - - // when there is only one warp per blockSize.x, we need to allocate two warps - // worth of shared memory so that we don't index shared memory out of bounds - const IndexType shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * sizeof( ResultType ) - : blockSize.x * sizeof( ResultType ); + CudaReductionKernelLauncher( const Index size ) + : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ), + blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice ) + / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ), + desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ), + originalSize( size ) + { + } - /*** - * Depending on the blockSize we generate appropriate template instance. 
- */ - switch( blockSize.x ) + template< typename DataFetcher, + typename Reduction, + typename VolatileReduction > + int start( const Reduction& reduction, + const VolatileReduction& volatileReduction, + const DataFetcher& dataFetcher, + const Result& zero, + ResultType*& output ) { - case 512: - CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, 
VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 1: - TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); - default: - TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); + //// + // create reference to the reduction buffer singleton and set size + const size_t buf_size = 2 * desGridSize * sizeof( ResultType ); + CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); + cudaReductionBuffer.setSize( buf_size ); + output = cudaReductionBuffer.template getData< ResultType >(); + + this-> reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output ); + return this->reducedSize; } - TNL_CHECK_CUDA_DEVICE; - // return the size of the output array on the CUDA device - return gridSize.x; -} + template< typename Reduction, + typename VolatileReduction > + Result finish( const Reduction& reduction, + const VolatileReduction& volatileReduction, + const Result& zero ) + { + //// + // Input is the first half of the buffer, output is the second half + const size_t buf_size = desGridSize * sizeof( ResultType ); + CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); + 
ResultType* input = cudaReductionBuffer.template getData< ResultType >(); + ResultType* output = &input[ buf_size ]; + + auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return input[ i ]; }; + while( this->reducedSize > 1 ) + { + this-> reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output ); + std::swap( input, output ); + } + + //// + // Copy result on CPU + ResultType result; + ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( &result, output, 1 ); + return result; + } + + protected: + template< typename DataFetcher, + typename Reduction, + typename VolatileReduction > + int launch( const Index size, + const Reduction& reduction, + const VolatileReduction& volatileReduction, + const DataFetcher& dataFetcher, + const Result& zero, + Result* output ) + { + dim3 blockSize, gridSize; + blockSize.x = Reduction_maxThreadsPerBlock; + gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); + + //// + // when there is only one warp per blockSize.x, we need to allocate two warps + // worth of shared memory so that we don't index shared memory out of bounds + const IndexType shmem = (blockSize.x <= 32) + ? 2 * blockSize.x * sizeof( ResultType ) + : blockSize.x * sizeof( ResultType ); + + /*** + * Depending on the blockSize we generate appropriate template instance. 
+ */ + switch( blockSize.x ) + { + case 512: + CudaReductionKernel< 512 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 256: + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 256 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 128: + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 128 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 64: + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 64 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 32: + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 32 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 16: + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 16 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 8: + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 8 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 4: + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, 
DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 4 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 2: + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 2 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 1: + TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); + default: + TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); + } + TNL_CHECK_CUDA_DEVICE; + + //// + // return the size of the output array on the CUDA device + return gridSize.x; + } + + const int activeDevice; + const int blocksdPerMultiprocessor; + const int desGridSize; + const IndexType originalSize; + IndexType reducedSize; +}; #endif } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index 22121de25..31c93504c 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -91,11 +91,13 @@ Reduction< Devices::Cuda >:: timer.start(); #endif + CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size ); + /**** * Reduce the data on the CUDA device. */ ResultType* deviceAux1( 0 ); - IndexType reducedSize = CudaReductionKernelLauncher( size, + IndexType reducedSize = reductionLauncher.start( reduction, volatileReduction, dataFetcher, @@ -112,7 +114,6 @@ Reduction< Devices::Cuda >:: /*** * Transfer the reduced data from device to host. 
*/ - //ResultType* resultArray[ reducedSize ]; std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize ); @@ -139,15 +140,7 @@ Reduction< Devices::Cuda >:: /*** * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. */ - auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; }; - while( reducedSize > 1 ) { - reducedSize = CudaReductionKernelLauncher( reducedSize, - reduction, - volatileReduction, - copyFetch, - zero, - deviceAux1 ); - } + auto result = reductionLauncher.finish( reduction, volatileReduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -156,14 +149,14 @@ Reduction< Devices::Cuda >:: timer.start(); #endif - ResultType resultArray[ 1 ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); - const ResultType result = resultArray[ 0 ]; + //ResultType resultArray[ 1 ]; + //ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); + //const ResultType result = resultArray[ 0 ]; - #ifdef CUDA_REDUCTION_PROFILING + /*#ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl; - #endif + #endif*/ return result; } -- GitLab