Loading src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -203,7 +203,7 @@ compareMemory( const Element1* destination, //TODO: The parallel reduction on the CUDA device with different element types is needed. bool result = false; Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; reductionOnCudaDevice( reductionEqualities, size, destination, source, result ); Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source, result ); return result; } Loading src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +17 −17 Original line number Diff line number Diff line Loading @@ -84,21 +84,21 @@ CudaMultireductionKernel( Operation operation, sdata[ tid ] = operation.initialValue(); while( gid + 4 * gridSizeX < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 ); operation.firstReduction( sdata[ tid ], gid, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 ); gid += 4 * gridSizeX; } while( gid + 2 * gridSizeX < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); operation.firstReduction( sdata[ tid ], gid, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); gid += 2 * gridSizeX; } while( gid < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.firstReduction( sdata[ tid ], gid, input1, input2 ); gid += gridSizeX; } __syncthreads(); Loading @@ -111,25 +111,25 @@ CudaMultireductionKernel( Operation operation, */ if( blockSizeX >= 1024 ) { if( threadIdx.x < 512 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); } __syncthreads(); } if( blockSizeX >= 512 ) { if( threadIdx.x < 256 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); } __syncthreads(); } if( blockSizeX >= 256 ) { if( threadIdx.x < 128 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); } __syncthreads(); } if( blockSizeX >= 128 ) { if( threadIdx.x < 64 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); } __syncthreads(); } Loading @@ -144,22 +144,22 @@ CudaMultireductionKernel( Operation operation, if( threadIdx.x < 32 ) { volatile ResultType* vsdata = sdata; if( blockSizeX >= 64 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); } if( blockSizeX >= 32 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); } if( blockSizeX >= 16 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); } if( blockSizeX >= 8 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); } if( blockSizeX >= 4 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); } if( blockSizeX >= 2 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); } } Loading src/TNL/Containers/Algorithms/CudaReductionKernel.h +17 −17 Original line number Diff line number Diff line Loading @@ -69,21 +69,21 @@ CudaReductionKernel( Operation operation, */ while( gid + 4 * gridSize < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 ); operation.firstReduction( sdata[ tid ], gid, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); operation.firstReduction( sdata[ tid ], gid, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); gid += 2 * gridSize; } while( gid < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.firstReduction( sdata[ tid ], gid, input1, input2 ); gid += gridSize; } __syncthreads(); Loading @@ -98,19 +98,19 @@ CudaReductionKernel( Operation operation, if( blockSize >= 1024 ) { if( tid < 512 ) operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } Loading @@ -118,7 +118,7 @@ CudaReductionKernel( Operation operation, if( blockSize >= 128 ) { if( tid < 64 ) operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } Loading @@ -132,34 +132,34 @@ CudaReductionKernel( Operation operation, volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work // We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } Loading src/TNL/Containers/Algorithms/Multireduction_impl.h +5 −5 Original line number Diff line number Diff line Loading @@ -206,7 +206,7 @@ reduce( Operation& operation, for( int k = 0; k < n; k++ ) { const DataType1* _input1 = input1 + k * ldInput1; for( IndexType i = 0; i < block_size; i++ ) r[ k ] = operation.reduceOnHost( offset + i, r[ k ], _input1, input2 ); operation.firstReduction( r[ k ], offset + i, _input1, input2 ); } } Loading @@ -216,7 +216,7 @@ reduce( Operation& operation, for( int k = 0; k < n; k++ ) { const DataType1* _input1 = input1 + k * ldInput1; for( IndexType i = blocks * block_size; i < size; i++ ) r[ k ] = operation.reduceOnHost( i, r[ k ], _input1, input2 ); operation.firstReduction( r[ k ], i, _input1, input2 ); } } Loading @@ -224,7 +224,7 @@ reduce( Operation& operation, #pragma omp critical { for( int k = 0; k < n; k++ ) operation.commonReductionOnDevice( result[ k ], r[ k ] ); operation.commonReduction( result[ k ], r[ k ] ); } } else { Loading @@ -237,14 +237,14 @@ reduce( Operation& operation, for( int k = 0; k < n; k++ ) { const DataType1* _input1 = input1 + k * ldInput1; for( IndexType i = 0; i < block_size; i++ ) result[ k ] = operation.reduceOnHost( offset + i, result[ k ], _input1, input2 ); operation.firstReduction( result[ k ], offset + i, _input1, input2 ); } } for( int k = 0; k < n; k++ ) { const DataType1* _input1 = input1 + k * ldInput1; for( IndexType i = blocks * block_size; i < size; i++ ) result[ k ] = operation.reduceOnHost( i, result[ k ], _input1, input2 ); operation.firstReduction( result[ k ], i, _input1, input2 ); } #ifdef HAVE_OPENMP } Loading src/TNL/Containers/Algorithms/Reduction.h +50 −15 Original line number Diff line number Diff line Loading @@ -8,29 +8,64 @@ /* See Copyright Notice in tnl/Copyright */ // Implemented by: Tomas Oberhuber, Jakub Klinkovsky #pragma once #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Devices/MIC.h> namespace TNL { namespace Containers { namespace Algorithms { // TODO: rename to // template< typename Device > // class Reduction // {}; // // and make a specialization for Devices::Host (as it is done in Multireduction.h) // It should be as fast as all the manual implementations in VectorOperations. template< typename Device > class Reduction { }; template<> class Reduction< Devices::Cuda > { public: template< typename Operation, typename Index > static bool reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); }; template<> class Reduction< Devices::Host > { public: template< typename Operation, typename Index > static bool reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); }; template<> class Reduction< Devices::MIC > { public: template< typename Operation, typename Index > bool reductionOnCudaDevice( const Operation& operation, static bool reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); }; } // namespace Algorithms } // namespace Containers } // namespace TNL #include <TNL/Containers/Algorithms/Reduction_impl.h> #include "Reduction_impl.h" Loading
src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -203,7 +203,7 @@ compareMemory( const Element1* destination, //TODO: The parallel reduction on the CUDA device with different element types is needed. bool result = false; Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; reductionOnCudaDevice( reductionEqualities, size, destination, source, result ); Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source, result ); return result; } Loading
src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +17 −17 Original line number Diff line number Diff line Loading @@ -84,21 +84,21 @@ CudaMultireductionKernel( Operation operation, sdata[ tid ] = operation.initialValue(); while( gid + 4 * gridSizeX < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 ); operation.firstReduction( sdata[ tid ], gid, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 ); gid += 4 * gridSizeX; } while( gid + 2 * gridSizeX < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); operation.firstReduction( sdata[ tid ], gid, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); gid += 2 * gridSizeX; } while( gid < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.firstReduction( sdata[ tid ], gid, input1, input2 ); gid += gridSizeX; } __syncthreads(); Loading @@ -111,25 +111,25 @@ CudaMultireductionKernel( Operation operation, */ if( blockSizeX >= 1024 ) { if( threadIdx.x < 512 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); } __syncthreads(); } if( blockSizeX >= 512 ) { if( threadIdx.x < 256 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); } __syncthreads(); } if( blockSizeX >= 256 ) { if( threadIdx.x < 128 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); } __syncthreads(); } if( blockSizeX >= 128 ) { if( threadIdx.x < 64 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); } __syncthreads(); } Loading @@ -144,22 +144,22 @@ CudaMultireductionKernel( Operation operation, if( threadIdx.x < 32 ) { volatile ResultType* vsdata = sdata; if( blockSizeX >= 64 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); } if( blockSizeX >= 32 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); } if( blockSizeX >= 16 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); } if( blockSizeX >= 8 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); } if( blockSizeX >= 4 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); } if( blockSizeX >= 2 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); } } Loading
src/TNL/Containers/Algorithms/CudaReductionKernel.h +17 −17 Original line number Diff line number Diff line Loading @@ -69,21 +69,21 @@ CudaReductionKernel( Operation operation, */ while( gid + 4 * gridSize < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 ); operation.firstReduction( sdata[ tid ], gid, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); operation.firstReduction( sdata[ tid ], gid, input1, input2 ); operation.firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); gid += 2 * gridSize; } while( gid < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.firstReduction( sdata[ tid ], gid, input1, input2 ); gid += gridSize; } __syncthreads(); Loading @@ -98,19 +98,19 @@ CudaReductionKernel( Operation operation, if( blockSize >= 1024 ) { if( tid < 512 ) operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } Loading @@ -118,7 +118,7 @@ CudaReductionKernel( Operation operation, if( blockSize >= 128 ) { if( tid < 64 ) operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] ); operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } Loading @@ -132,34 +132,34 @@ CudaReductionKernel( Operation operation, volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work // We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] ); operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } Loading
src/TNL/Containers/Algorithms/Multireduction_impl.h +5 −5 Original line number Diff line number Diff line Loading @@ -206,7 +206,7 @@ reduce( Operation& operation, for( int k = 0; k < n; k++ ) { const DataType1* _input1 = input1 + k * ldInput1; for( IndexType i = 0; i < block_size; i++ ) r[ k ] = operation.reduceOnHost( offset + i, r[ k ], _input1, input2 ); operation.firstReduction( r[ k ], offset + i, _input1, input2 ); } } Loading @@ -216,7 +216,7 @@ reduce( Operation& operation, for( int k = 0; k < n; k++ ) { const DataType1* _input1 = input1 + k * ldInput1; for( IndexType i = blocks * block_size; i < size; i++ ) r[ k ] = operation.reduceOnHost( i, r[ k ], _input1, input2 ); operation.firstReduction( r[ k ], i, _input1, input2 ); } } Loading @@ -224,7 +224,7 @@ reduce( Operation& operation, #pragma omp critical { for( int k = 0; k < n; k++ ) operation.commonReductionOnDevice( result[ k ], r[ k ] ); operation.commonReduction( result[ k ], r[ k ] ); } } else { Loading @@ -237,14 +237,14 @@ reduce( Operation& operation, for( int k = 0; k < n; k++ ) { const DataType1* _input1 = input1 + k * ldInput1; for( IndexType i = 0; i < block_size; i++ ) result[ k ] = operation.reduceOnHost( offset + i, result[ k ], _input1, input2 ); operation.firstReduction( result[ k ], offset + i, _input1, input2 ); } } for( int k = 0; k < n; k++ ) { const DataType1* _input1 = input1 + k * ldInput1; for( IndexType i = blocks * block_size; i < size; i++ ) result[ k ] = operation.reduceOnHost( i, result[ k ], _input1, input2 ); operation.firstReduction( result[ k ], i, _input1, input2 ); } #ifdef HAVE_OPENMP } Loading
src/TNL/Containers/Algorithms/Reduction.h +50 −15 Original line number Diff line number Diff line Loading @@ -8,29 +8,64 @@ /* See Copyright Notice in tnl/Copyright */ // Implemented by: Tomas Oberhuber, Jakub Klinkovsky #pragma once #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Devices/MIC.h> namespace TNL { namespace Containers { namespace Algorithms { // TODO: rename to // template< typename Device > // class Reduction // {}; // // and make a specialization for Devices::Host (as it is done in Multireduction.h) // It should be as fast as all the manual implementations in VectorOperations. template< typename Device > class Reduction { }; template<> class Reduction< Devices::Cuda > { public: template< typename Operation, typename Index > static bool reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); }; template<> class Reduction< Devices::Host > { public: template< typename Operation, typename Index > static bool reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); }; template<> class Reduction< Devices::MIC > { public: template< typename Operation, typename Index > bool reductionOnCudaDevice( const Operation& operation, static bool reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); }; } // namespace Algorithms } // namespace Containers } // namespace TNL #include <TNL/Containers/Algorithms/Reduction_impl.h> #include "Reduction_impl.h"