Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +0 −21 Original line number Diff line number Diff line Loading @@ -67,27 +67,6 @@ struct CSRAdaptiveKernel static TNL::String getKernelType(); static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; // How many shared memory use per block in CSR Adaptive kernel static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::StreamedSharedMemory(); //20000; //24576; TODO: // Number of elements in shared memory static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); // Number of warps in block for CSR Adaptive static constexpr Index WARPS = THREADS_ADAPTIVE / 32; // Number of elements in shared memory per one warp static constexpr Index SHARED_PER_WARP = SHARED / WARPS; // Max length of row to process one warp for CSR Light, MultiVector static constexpr Index MAX_ELEMENTS_PER_WARP = 384; // Max length of row to process one warp for CSR Adaptive static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::MaxAdaptiveElementsPerWarp(); template< typename Offsets > void init( const Offsets& offsets ); Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -165,7 +165,7 @@ initValueSize( const Offsets& offsets ) if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize() / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; Loading src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +1 −1 Original line number Diff line number Diff line Loading @@ -24,7 +24,7 @@ struct CSRAdaptiveKernelParameters * * \return CUDA block size. */ static constexpr int CudaBlockSize() { return 256; }; //sizeof( Value ) == 8 ? 128 : 256; }; static constexpr int CudaBlockSize() { return 128; }; //sizeof( Value ) == 8 ? 128 : 256; }; //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); }; /** Loading Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +0 −21 Original line number Diff line number Diff line Loading @@ -67,27 +67,6 @@ struct CSRAdaptiveKernel static TNL::String getKernelType(); static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; // How many shared memory use per block in CSR Adaptive kernel static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::StreamedSharedMemory(); //20000; //24576; TODO: // Number of elements in shared memory static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); // Number of warps in block for CSR Adaptive static constexpr Index WARPS = THREADS_ADAPTIVE / 32; // Number of elements in shared memory per one warp static constexpr Index SHARED_PER_WARP = SHARED / WARPS; // Max length of row to process one warp for CSR Light, MultiVector static constexpr Index MAX_ELEMENTS_PER_WARP = 384; // Max length of row to process one warp for CSR Adaptive static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::MaxAdaptiveElementsPerWarp(); template< typename Offsets > void init( const Offsets& offsets ); Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -165,7 +165,7 @@ initValueSize( const Offsets& offsets ) if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize() / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; Loading
src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +1 −1 Original line number Diff line number Diff line Loading @@ -24,7 +24,7 @@ struct CSRAdaptiveKernelParameters * * \return CUDA block size. */ static constexpr int CudaBlockSize() { return 256; }; //sizeof( Value ) == 8 ? 128 : 256; }; static constexpr int CudaBlockSize() { return 128; }; //sizeof( Value ) == 8 ? 128 : 256; }; //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); }; /** Loading