Loading src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +54 −26 Original line number Diff line number Diff line Loading @@ -113,51 +113,78 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; block.getWarpIdx() * MAX_ELEM_PER_WARP; Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; /// block.getWarpIdx() * MAX_ELEM_PER_WARP; //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); if( to > end ) to = end; //if( to > end ) // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; //printf( "tid %d : start = %d \n", tid, minID + laneID ); for( Index globalIdx = begin + laneIdx + offset; globalIdx < to; globalIdx += warpSize ) //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //printf( "tid %d -> %d \n", tid, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //if( laneIdx == 0 ) // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); //if( laneIdx == 0 ) // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; __syncthreads(); // Reduction in multivectorShared if( tid < 16 ) if( block.getWarpIdx() == 0 && laneIdx < 16 ) { multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 16 ] ); constexpr int totalWarps = CudaBlockSize / warpSize; if( totalWarps >= 32 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); __syncwarp(); multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 8 ] ); } if( totalWarps >= 16 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] ); __syncwarp(); multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 4 ] ); } if( totalWarps >= 8 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] ); __syncwarp(); multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 2 ] ); } if( totalWarps >= 4 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] ); __syncwarp(); multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 1 ] ); } if( totalWarps >= 2 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] ); __syncwarp(); if( tid == 0 ) } if( laneIdx == 0 ) { printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); keep( segmentIdx, multivectorShared[ 0 ] ); } } Loading Loading @@ -216,7 +243,6 @@ struct CSRKernelAdaptiveView return; } this->printBlocks(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; Loading Loading @@ -322,7 +348,7 @@ struct CSRKernelAdaptive static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); Loading Loading @@ -364,7 +390,7 @@ struct CSRKernelAdaptive if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) type = details::Type::VECTOR; else type = details::Type::VECTOR; // TODO: Put LONG back type = details::Type::LONG; //type = Type::LONG; // return current + 1; } Loading Loading @@ -393,7 +419,9 @@ struct CSRKernelAdaptive { const Index blocksCount = inBlock.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft ); for( Index index = 1; index < warpsLeft; index++ ) Loading src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h +5 −0 Original line number Diff line number Diff line Loading @@ -190,6 +190,11 @@ struct CSRAdaptiveKernelBlockDescriptor return this->warpIdx; } __cuda_callable__ uint8_t getWarpsCount() const { return this->warpsCount; } void print( std::ostream& str ) const { str << "Type: "; Loading src/UnitTests/Matrices/SparseMatrixTest.hpp +9 −8 Original line number Diff line number Diff line Loading @@ -1070,7 +1070,6 @@ void test_VectorProduct() outVector_1.setElement( j, 0 ); m_1.vectorProduct( inVector_1, outVector_1 ); EXPECT_EQ( outVector_1.getElement( 0 ), 2 ); EXPECT_EQ( outVector_1.getElement( 1 ), 10 ); EXPECT_EQ( outVector_1.getElement( 2 ), 8 ); Loading Loading @@ -1310,7 +1309,7 @@ void test_VectorProduct() ///// // Large test const IndexType size( 35 ); const IndexType size( 1051 ); //for( int size = 1; size < 1000; size++ ) { //std::cerr << " size = " << size << std::endl; Loading Loading @@ -1338,26 +1337,28 @@ void test_VectorProduct() EXPECT_EQ( out.getElement( i ), i + 1 ); // Test with large triangular matrix Matrix m2( size, size ); rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return i + 1; } ); const int rows( size ), columns( size ); Matrix m2( rows, columns ); rowCapacities.setSize( rows ); rowCapacities.evaluate( [=] __cuda_callable__ ( IndexType i ) { return i + 1; } ); m2.setRowCapacities( rowCapacities ); auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) { if( localIdx <= row ) { value = row -localIdx + 1; value = localIdx + 1; column = localIdx; } }; m2.forAllRows( f2 ); // check that the matrix was initialized TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( size ); TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( rows ); m2.getCompressedRowLengths( rowLengths ); EXPECT_EQ( rowLengths, rowCapacities ); out.setSize( rows ); out = 0.0; m2.vectorProduct( in, out ); //std::cerr << out << std::endl; for( IndexType i = 0; i < size; i++ ) for( IndexType i = 0; i < rows; i++ ) EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 ); } } Loading src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h +6 −6 Original line number Diff line number Diff line Loading @@ -30,12 +30,12 @@ using MatrixTypes = ::testing::Types TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > #ifdef HAVE_CUDA ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > #endif >; Loading Loading
src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +54 −26 Original line number Diff line number Diff line Loading @@ -113,51 +113,78 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; block.getWarpIdx() * MAX_ELEM_PER_WARP; Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; /// block.getWarpIdx() * MAX_ELEM_PER_WARP; //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); if( to > end ) to = end; //if( to > end ) // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; //printf( "tid %d : start = %d \n", tid, minID + laneID ); for( Index globalIdx = begin + laneIdx + offset; globalIdx < to; globalIdx += warpSize ) //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //printf( "tid %d -> %d \n", tid, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //if( laneIdx == 0 ) // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); //if( laneIdx == 0 ) // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; __syncthreads(); // Reduction in multivectorShared if( tid < 16 ) if( block.getWarpIdx() == 0 && laneIdx < 16 ) { multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 16 ] ); constexpr int totalWarps = CudaBlockSize / warpSize; if( totalWarps >= 32 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); __syncwarp(); multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 8 ] ); } if( totalWarps >= 16 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] ); __syncwarp(); multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 4 ] ); } if( totalWarps >= 8 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] ); __syncwarp(); multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 2 ] ); } if( totalWarps >= 4 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] ); __syncwarp(); multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 1 ] ); } if( totalWarps >= 2 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] ); __syncwarp(); if( tid == 0 ) } if( laneIdx == 0 ) { printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); keep( segmentIdx, multivectorShared[ 0 ] ); } } Loading Loading @@ -216,7 +243,6 @@ struct CSRKernelAdaptiveView return; } this->printBlocks(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; Loading Loading @@ -322,7 +348,7 @@ struct CSRKernelAdaptive static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); Loading Loading @@ -364,7 +390,7 @@ struct CSRKernelAdaptive if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) type = details::Type::VECTOR; else type = details::Type::VECTOR; // TODO: Put LONG back type = details::Type::LONG; //type = Type::LONG; // return current + 1; } Loading Loading @@ -393,7 +419,9 @@ struct CSRKernelAdaptive { const Index blocksCount = inBlock.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft ); for( Index index = 1; index < warpsLeft; index++ ) Loading
src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h +5 −0 Original line number Diff line number Diff line Loading @@ -190,6 +190,11 @@ struct CSRAdaptiveKernelBlockDescriptor return this->warpIdx; } __cuda_callable__ uint8_t getWarpsCount() const { return this->warpsCount; } void print( std::ostream& str ) const { str << "Type: "; Loading
src/UnitTests/Matrices/SparseMatrixTest.hpp +9 −8 Original line number Diff line number Diff line Loading @@ -1070,7 +1070,6 @@ void test_VectorProduct() outVector_1.setElement( j, 0 ); m_1.vectorProduct( inVector_1, outVector_1 ); EXPECT_EQ( outVector_1.getElement( 0 ), 2 ); EXPECT_EQ( outVector_1.getElement( 1 ), 10 ); EXPECT_EQ( outVector_1.getElement( 2 ), 8 ); Loading Loading @@ -1310,7 +1309,7 @@ void test_VectorProduct() ///// // Large test const IndexType size( 35 ); const IndexType size( 1051 ); //for( int size = 1; size < 1000; size++ ) { //std::cerr << " size = " << size << std::endl; Loading Loading @@ -1338,26 +1337,28 @@ void test_VectorProduct() EXPECT_EQ( out.getElement( i ), i + 1 ); // Test with large triangular matrix Matrix m2( size, size ); rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return i + 1; } ); const int rows( size ), columns( size ); Matrix m2( rows, columns ); rowCapacities.setSize( rows ); rowCapacities.evaluate( [=] __cuda_callable__ ( IndexType i ) { return i + 1; } ); m2.setRowCapacities( rowCapacities ); auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) { if( localIdx <= row ) { value = row -localIdx + 1; value = localIdx + 1; column = localIdx; } }; m2.forAllRows( f2 ); // check that the matrix was initialized TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( size ); TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( rows ); m2.getCompressedRowLengths( rowLengths ); EXPECT_EQ( rowLengths, rowCapacities ); out.setSize( rows ); out = 0.0; m2.vectorProduct( in, out ); //std::cerr << out << std::endl; for( IndexType i = 0; i < size; i++ ) for( IndexType i = 0; i < rows; i++ ) EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 ); } } Loading
src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h +6 −6 Original line number Diff line number Diff line Loading @@ -30,12 +30,12 @@ using MatrixTypes = ::testing::Types TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > #ifdef HAVE_CUDA ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > #endif >; Loading