Loading src/TNL/Containers/Algorithms/ArrayAssignment.h +0 −2 Original line number Diff line number Diff line Loading @@ -86,8 +86,6 @@ struct ArrayAssignment< Array, T, false > }; } // namespace Algorithms } // namespace Containers } // namespace TNL src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +26 −8 Original line number Diff line number Diff line Loading @@ -205,8 +205,19 @@ compareMemory( const Element1* destination, { TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source ); Element1* d; cudaMalloc( ( void** ) &d, size * sizeof( Element1 ) ); auto fetch = [=] __cuda_callable__ ( Index i ) { return d[ 0 ]; }; //( destination[ i ] == source[ i ] ); }; auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, //[=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }, fetch, //[=] __cuda_callable__ ( Index i ) { return destination[ i ]; }, true ); /*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/ } template< typename Element, Loading @@ -219,11 +230,11 @@ containsValue( const Element* data, { TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; //Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; //reductionContainsValue.setValue( value ); auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; return Reduction< Devices::Cuda >::reduce( size, TNL::sum, fetch, 0.0 ); auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a || b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); } template< typename Element, Loading @@ -237,9 +248,16 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; if( size == 0 ) return false; auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); /*Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; reductionContainsOnlyValue.setValue( value ); return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr ); return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );*/ } Loading src/TNL/Containers/Algorithms/CommonVectorOperations.h 0 → 100644 +79 −0 Original line number Diff line number Diff line /*************************************************************************** CommonVectorOperations.h - description ------------------- begin : Apr 12, 2019 copyright : (C) 2019 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once namespace TNL { namespace Containers { namespace Algorithms { template< typename Device > struct CommonVectorOperations { using DeviceType = Device; template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorMax( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorMin( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorAbsMax( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorAbsMin( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorL1Norm( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorL2Norm( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType, typename Scalar > static ResultType getVectorLpNorm( const Vector& v, const Scalar p ); template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorSum( const Vector& v ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceMax( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceMin( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceAbsMax( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceAbsMin( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceL1Norm( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceL2Norm( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType, typename Scalar > static ResultType getVectorDifferenceLpNorm( const Vector1& v1, const Vector2& v2, const Scalar p ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 ); }; } // namespace Algorithms } // namespace Containers } // namespace TNL #include <TNL/Containers/Algorithms/CommonVectorOperations.hpp> src/TNL/Containers/Algorithms/CudaReductionKernel.h +29 −33 Original line number Diff line number Diff line Loading @@ -40,20 +40,18 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi #endif template< int blockSize, typename Real, typename Result, typename DataFetcher, typename Reduction, typename Index, typename Result = decltype( std::declval< DataFetcher >( 0,0 ) ) > typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Real& zero, CudaReductionKernel( const Result zero, const DataFetcher& dataFetcher, const Reduction& reduction, const Index size, Result* output ) { using RealType = Real; using IndexType = Index; using ResultType = Result; Loading @@ -69,11 +67,12 @@ CudaReductionKernel( const Real& zero, const IndexType gridSize = blockDim.x * gridDim.x; sdata[ tid ] = zero; /*** * Read data into the shared memory. We start with the * sequential reduction. */ while( gid + 4 * gridSize < size ) /*while( gid + 4 * gridSize < size ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); Loading @@ -86,15 +85,14 @@ CudaReductionKernel( const Real& zero, sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } }*/ while( gid < size ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = dataFetcher( gid ); //reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); return; //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); //return; Loading @@ -104,19 +102,19 @@ CudaReductionKernel( const Real& zero, if( blockSize >= 1024 ) { if( tid < 512 ) reduction( sdata[ tid ], sdata[ tid + 512 ] ); sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) reduction( sdata[ tid ], sdata[ tid + 256 ] ); sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) reduction( sdata[ tid ], sdata[ tid + 128 ] ); sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } Loading @@ -124,7 +122,7 @@ CudaReductionKernel( const Real& zero, if( blockSize >= 128 ) { if( tid < 64 ) reduction( sdata[ tid ], sdata[ tid + 64 ] ); sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } Loading @@ -138,34 +136,34 @@ CudaReductionKernel( const Real& zero, volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work // We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } Loading @@ -181,8 +179,7 @@ CudaReductionKernel( const Real& zero, } template< typename Real, typename DataFetcher, template< typename DataFetcher, typename Reduction, typename Index, typename Result > Loading @@ -190,10 +187,9 @@ int CudaReductionKernelLauncher( const Index size, const Reduction& reduction, const DataFetcher& dataFetcher, const Real& zero, const Result& zero, Result*& output ) { using RealType = Real; using IndexType = Index; using ResultType = Result; Loading Loading @@ -236,49 +232,49 @@ CudaReductionKernelLauncher( const Index size, <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 256: cudaFuncSetCacheConfig(CudaReductionKernel< 256, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 128: cudaFuncSetCacheConfig(CudaReductionKernel< 128, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 64: cudaFuncSetCacheConfig(CudaReductionKernel< 64, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 32: cudaFuncSetCacheConfig(CudaReductionKernel< 32, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 16: cudaFuncSetCacheConfig(CudaReductionKernel< 16, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 8: cudaFuncSetCacheConfig(CudaReductionKernel< 8, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 4: cudaFuncSetCacheConfig(CudaReductionKernel< 4, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 2: cudaFuncSetCacheConfig(CudaReductionKernel< 2, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); Loading src/TNL/Containers/Algorithms/Reduction.h +6 −6 Original line number Diff line number Diff line Loading @@ -30,9 +30,9 @@ class Reduction< Devices::Cuda > { public: template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher, typename Result = decltype( DataFetcher::operator() ) > typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, Loading @@ -45,9 +45,9 @@ class Reduction< Devices::Host > { public: template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher, typename Result = decltype( DataFetcher::operator() ) > typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, Loading @@ -60,9 +60,9 @@ class Reduction< Devices::MIC > { public: template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher, typename Result = decltype( DataFetcher::operator() ) > typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, Loading Loading
src/TNL/Containers/Algorithms/ArrayAssignment.h +0 −2 Original line number Diff line number Diff line Loading @@ -86,8 +86,6 @@ struct ArrayAssignment< Array, T, false > }; } // namespace Algorithms } // namespace Containers } // namespace TNL
src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +26 −8 Original line number Diff line number Diff line Loading @@ -205,8 +205,19 @@ compareMemory( const Element1* destination, { TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source ); Element1* d; cudaMalloc( ( void** ) &d, size * sizeof( Element1 ) ); auto fetch = [=] __cuda_callable__ ( Index i ) { return d[ 0 ]; }; //( destination[ i ] == source[ i ] ); }; auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, //[=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }, fetch, //[=] __cuda_callable__ ( Index i ) { return destination[ i ]; }, true ); /*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/ } template< typename Element, Loading @@ -219,11 +230,11 @@ containsValue( const Element* data, { TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; //Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; //reductionContainsValue.setValue( value ); auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; return Reduction< Devices::Cuda >::reduce( size, TNL::sum, fetch, 0.0 ); auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a || b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); } template< typename Element, Loading @@ -237,9 +248,16 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; if( size == 0 ) return false; auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); /*Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; reductionContainsOnlyValue.setValue( value ); return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr ); return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );*/ } Loading
src/TNL/Containers/Algorithms/CommonVectorOperations.h 0 → 100644 +79 −0 Original line number Diff line number Diff line /*************************************************************************** CommonVectorOperations.h - description ------------------- begin : Apr 12, 2019 copyright : (C) 2019 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once namespace TNL { namespace Containers { namespace Algorithms { template< typename Device > struct CommonVectorOperations { using DeviceType = Device; template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorMax( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorMin( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorAbsMax( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorAbsMin( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorL1Norm( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorL2Norm( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType, typename Scalar > static ResultType getVectorLpNorm( const Vector& v, const Scalar p ); template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorSum( const Vector& v ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceMax( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceMin( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceAbsMax( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceAbsMin( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceL1Norm( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceL2Norm( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType, typename Scalar > static ResultType getVectorDifferenceLpNorm( const Vector1& v1, const Vector2& v2, const Scalar p ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 ); template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 ); }; } // namespace Algorithms } // namespace Containers } // namespace TNL #include <TNL/Containers/Algorithms/CommonVectorOperations.hpp>
src/TNL/Containers/Algorithms/CudaReductionKernel.h +29 −33 Original line number Diff line number Diff line Loading @@ -40,20 +40,18 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi #endif template< int blockSize, typename Real, typename Result, typename DataFetcher, typename Reduction, typename Index, typename Result = decltype( std::declval< DataFetcher >( 0,0 ) ) > typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Real& zero, CudaReductionKernel( const Result zero, const DataFetcher& dataFetcher, const Reduction& reduction, const Index size, Result* output ) { using RealType = Real; using IndexType = Index; using ResultType = Result; Loading @@ -69,11 +67,12 @@ CudaReductionKernel( const Real& zero, const IndexType gridSize = blockDim.x * gridDim.x; sdata[ tid ] = zero; /*** * Read data into the shared memory. We start with the * sequential reduction. */ while( gid + 4 * gridSize < size ) /*while( gid + 4 * gridSize < size ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); Loading @@ -86,15 +85,14 @@ CudaReductionKernel( const Real& zero, sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } }*/ while( gid < size ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = dataFetcher( gid ); //reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); return; //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); //return; Loading @@ -104,19 +102,19 @@ CudaReductionKernel( const Real& zero, if( blockSize >= 1024 ) { if( tid < 512 ) reduction( sdata[ tid ], sdata[ tid + 512 ] ); sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) reduction( sdata[ tid ], sdata[ tid + 256 ] ); sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) reduction( sdata[ tid ], sdata[ tid + 128 ] ); sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } Loading @@ -124,7 +122,7 @@ CudaReductionKernel( const Real& zero, if( blockSize >= 128 ) { if( tid < 64 ) reduction( sdata[ tid ], sdata[ tid + 64 ] ); sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } Loading @@ -138,34 +136,34 @@ CudaReductionKernel( const Real& zero, volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work // We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } Loading @@ -181,8 +179,7 @@ CudaReductionKernel( const Real& zero, } template< typename Real, typename DataFetcher, template< typename DataFetcher, typename Reduction, typename Index, typename Result > Loading @@ -190,10 +187,9 @@ int CudaReductionKernelLauncher( const Index size, const Reduction& reduction, const DataFetcher& dataFetcher, const Real& zero, const Result& zero, Result*& output ) { using RealType = Real; using IndexType = Index; using ResultType = Result; Loading Loading @@ -236,49 +232,49 @@ CudaReductionKernelLauncher( const Index size, <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 256: cudaFuncSetCacheConfig(CudaReductionKernel< 256, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 128: cudaFuncSetCacheConfig(CudaReductionKernel< 128, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 64: cudaFuncSetCacheConfig(CudaReductionKernel< 64, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 32: cudaFuncSetCacheConfig(CudaReductionKernel< 32, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 16: cudaFuncSetCacheConfig(CudaReductionKernel< 16, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 8: cudaFuncSetCacheConfig(CudaReductionKernel< 8, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 4: cudaFuncSetCacheConfig(CudaReductionKernel< 4, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 2: cudaFuncSetCacheConfig(CudaReductionKernel< 2, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); Loading
src/TNL/Containers/Algorithms/Reduction.h +6 −6 Original line number Diff line number Diff line Loading @@ -30,9 +30,9 @@ class Reduction< Devices::Cuda > { public: template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher, typename Result = decltype( DataFetcher::operator() ) > typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, Loading @@ -45,9 +45,9 @@ class Reduction< Devices::Host > { public: template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher, typename Result = decltype( DataFetcher::operator() ) > typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, Loading @@ -60,9 +60,9 @@ class Reduction< Devices::MIC > { public: template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher, typename Result = decltype( DataFetcher::operator() ) > typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, Loading