Loading src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +318 −128 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRKernelScalar.h> namespace TNL { namespace Algorithms { Loading @@ -28,14 +29,17 @@ enum class Type { }; template< typename Index > union Block { Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { union Block { Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { this->index[0] = row; this->index[1] = index; this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; } Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { this->index[0] = row; this->index[1] = 0; this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; Loading @@ -51,93 +55,177 @@ union Block { Block() = default; Type getType() const { if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) return Type::STREAM; if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) return Type::VECTOR; return Type::LONG; } Index getFirstRow() const { return index[ 0 ]; } Index getRowsInBlock() const { return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; } void print( std::ostream& str ) const { Type type = this->getType(); str << "Type: "; switch( type ) { case Type::STREAM: str << " Stream "; break; case Type::VECTOR: str << " Vector "; break; case Type::LONG: str << " Long "; break; } str << " first row: " << getFirstRow(); str << " rows per block: " << getRowsInBlock(); str << " index in warp: " << index[ 1 ]; } Index index[2]; // index[0] is row pointer, index[1] is index in warp uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID //twobytes[3/5] is nextRow - row }; template< typename Index > std::ostream& operator<< ( std::ostream& str, const Block< Index >& block ) { block.print( str ); return str; } #ifdef HAVE_CUDA template< typename Real, typename Index, int warpSize, template< int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP > __global__ void SpMVCSRAdaptive( const Real *inVector, Real *outVector, const Index* rowPointers, const Index* columnIndexes, const Real* values, const Block<Index> *blocks, int MAX_ELEM_PER_WARP, typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks, Index blocksSize, Index gridID) { int gridIdx, Offsets offsets, Index first, Index last, Fetch fetch, Reduction reduce, ResultKeeper keep, Real zero, Args... args ) { __shared__ Real shared[WARPS][SHARED_PER_WARP]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; constexpr size_t MAX_X_DIM = 2147483647; const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if (blockIdx >= blocksSize) return; Real result = 0.0; Real result = zero; bool compute( true ); const Index laneID = threadIdx.x & 31; // & is cheaper than % Block<Index> block = blocks[blockIdx]; const Index minID = rowPointers[block.index[0]/* minRow */]; const Index minID = offsets[block.index[0]/* minRow */]; Index i, to, maxID; if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { /////////////////////////////////////* CSR STREAM *////////////// if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { /**** * CSR Stream: Copy first all data into shared memory */ const Index warpID = threadIdx.x / 32; maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; /* Stream data to shared memory */ for (i = laneID + minID; i < maxID; i += warpSize) shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]]; for( Index globalIdx = laneID + minID; globalIdx < maxID; globalIdx += warpSize ) { shared[warpID][i - minID] = //fetch( globalIdx, compute ); details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); printf( "Stream: Fetch at %d -> %f \n", globalIdx, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ) ); // TODO:: fix this //values[i] * inVector[columnIndexes[i]]; } const Index maxRow = block.index[0]/* minRow */ + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); /* Calculate result */ for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { to = rowPointers[i + 1] - minID; // end of preprocessed data result = 0; for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { to = offsets[i + 1] - minID; // end of preprocessed data result = zero; /* Scalar reduction */ for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID) result += shared[warpID][sharedID]; for( Index sharedID = offsets[ i ] - minID; sharedID < to; ++sharedID) result = reduce( result, shared[warpID][sharedID] ); outVector[i] = result; // Write result printf( "Stream: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result ); keep( i, result ); //outVector[i] = result; // Write result } } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) { } else //if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) { printf( "Vector: threadIdx = %d \n", threadIdx ); /////////////////////////////////////* CSR VECTOR *////////////// maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; const Index segmentIdx = block.index[0]; for (i = minID + laneID; i < maxID; i += warpSize) result += values[i] * inVector[columnIndexes[i]]; for( Index globalIdx = minID + laneID; globalIdx < maxID; globalIdx += warpSize ) result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx //values[i] * inVector[columnIndexes[i]]; /* Parallel reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result } else { /////////////////////////////////////* CSR VECTOR L *///////////// /* Number of elements processed by previous warps */ const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP; to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP; maxID = rowPointers[block.index[0]/* minRow */ + 1]; if (to > maxID) to = maxID; for (i = minID + offset + laneID; i < to; i += warpSize) result += values[i] * inVector[columnIndexes[i]]; result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); if( laneID == 0 ) { printf( "Vector: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result ); keep( segmentIdx, result ); //outVector[block.index[0]/* minRow */] = result; // Write result } }/* else { ///////////////////////////////////// CSR VECTOR L ///////////// // Number of elements processed by previous warps const Index offset = block.index[1] * MAX_ELEM_PER_WARP; to = minID + (block.index[1] + 1) * MAX_ELEM_PER_WARP; maxID = offsets[block.index[0] + 1]; if( to > maxID ) to = maxID; for( Index globalIdx = minID + offset + laneID; globalIdx < to; globalIdx += warpSize ) result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; /* Parallel reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result); } if (laneID == 0) atomicAdd(&outVector[block.index[0] ], result); }*/ } #endif Loading @@ -150,6 +238,20 @@ struct CSRKernelAdaptiveView using DeviceType = Device; using ViewType = CSRKernelAdaptiveView< Index, Device >; using ConstViewType = CSRKernelAdaptiveView< Index, Device >; using BlocksType = TNL::Containers::Vector< Block< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; CSRKernelAdaptiveView() = default; CSRKernelAdaptiveView( BlocksType& blocks ) { this->blocks.bind( blocks ); }; void setBlocks( BlocksType& blocks ) { this->blocks.bind( blocks ); } ViewType getView() { return *this; }; Loading @@ -170,38 +272,102 @@ struct CSRKernelAdaptiveView const Real& zero, Args... args ) const { #ifdef HAVE_CUDA if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) { TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); return; } this->printBlocks(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; static constexpr Index THREADS_VECTOR = 128; static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp for CSR Light, MultiVector */ static constexpr Index MAX_ELEMENTS_PER_WARP = 384; Index blocks; const Index threads = matrix.THREADS_ADAPTIVE; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); /* Number of warps in block for CSR Adaptive */ static constexpr Index WARPS = THREADS_ADAPTIVE / 32; /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / WARPS; constexpr int warpSize = 32; Index blocksCount; const Index threads = THREADS_ADAPTIVE; constexpr size_t MAX_X_DIM = 2147483647; /* Fill blocks */ size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block size_t neededThreads = blocks.getSize() * warpSize; // one warp per block /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { if (MAX_X_DIM * threads >= neededThreads) { blocksCount = roundUpDivision(neededThreads, threads); neededThreads = 0; } else { blocks = MAX_X_DIM; } else { blocksCount = MAX_X_DIM; neededThreads -= MAX_X_DIM * threads; } SpMVCSRAdaptive< Real, Index, warpSize, matrix.WARPS, matrix.SHARED_PER_WARP, matrix.MAX_ELEMENTS_PER_WARP_ADAPT > <<<blocks, threads>>>( inVector, outVector, matrix.getRowPointers().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.blocks.getData(), matrix.blocks.getSize() - 1, // last block shouldn't be used grid ); segmentsReductionCSRAdaptiveKernel< warpSize, WARPS, SHARED_PER_WARP, MAX_ELEMENTS_PER_WARP_ADAPT, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > <<<blocksCount, threads>>>( blocks.getData(), blocks.getSize() - 1, // last block shouldn't be used gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); } #endif } CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView ) { this->blocks.bind( kernelView.blocks ); return *this; } void printBlocks() const { for( Index i = 0; i < this->blocks.getSize(); i++ ) { auto block = blocks.getElement( i ); std::cout << "Block " << i << " : " << block << std::endl; } } protected: BlocksView blocks; }; template< typename Index, Loading @@ -212,6 +378,9 @@ struct CSRKernelAdaptive using DeviceType = Device; using ViewType = CSRKernelAdaptiveView< Index, Device >; using ConstViewType = CSRKernelAdaptiveView< Index, Device >; using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; Loading @@ -227,23 +396,35 @@ struct CSRKernelAdaptive /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / WARPS; /* Max length of row to process one warp for CSR Light, MultiVector */ static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; template< typename Offsets > Index findLimit(const Index start, const Offsets& offsets, const Index size, Type &type, Index &sum) { Index &sum) { sum = 0; for (Index current = start; current < size - 1; ++current) { for (Index current = start; current < size - 1; ++current) { Index elements = offsets.getElement(current + 1) - offsets.getElement(current); sum += elements; if (sum > matrix.SHARED_PER_WARP) { if (current - start > 0) { // extra row if (sum >SHARED_PER_WARP) { if (current - start > 0) { // extra row type = Type::STREAM; return current; } else { // one long row if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP_ADAPT) } else { // one long row if (sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT) type = Type::VECTOR; else type = Type::LONG; Loading @@ -251,7 +432,6 @@ struct CSRKernelAdaptive } } } type = Type::STREAM; return size - 1; // return last row pointer } Loading @@ -269,8 +449,7 @@ struct CSRKernelAdaptive while (nextStart != rows - 1) { Type type; nextStart = findLimit<Real, Index, Device, KernelType>( start, *this, rows, type, sum ); nextStart = findLimit( start, offsets, rows, type, sum ); if (type == Type::LONG) { Loading @@ -284,8 +463,8 @@ struct CSRKernelAdaptive { inBlock.emplace_back(start, type, nextStart, this->rowPointers.getElement(nextStart), this->rowPointers.getElement(start) ); offsets.getElement(nextStart), offsets.getElement(start) ); } start = nextStart; } Loading @@ -295,11 +474,19 @@ struct CSRKernelAdaptive this->blocks.setSize(inBlock.size()); for (size_t i = 0; i < inBlock.size(); ++i) this->blocks.setElement(i, inBlock[i]); this->view.setBlocks( blocks ); }; ViewType getView() { return view; }; void reset() { this->blocks.reset(); this->view.setBlocks( blocks ); } ViewType getView() { return this->view; }; ConstViewType getConstView() const { return ConstViewType(); }; ConstViewType getConstView() const { return this->view; }; template< typename OffsetsView, typename Fetch, Loading @@ -319,6 +506,9 @@ struct CSRKernelAdaptive view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } protected: BlocksType blocks; ViewType view; }; Loading src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp +2 −2 Original line number Diff line number Diff line /*************************************************************************** SparseMatrixTest_CSRHybrid.cpp - description SparseMatrixTest_CSRAdaptive.cpp - description ------------------- begin : Jan 23, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. Loading @@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ #include "SparseMatrixTest_CSRHybrid.h" #include "SparseMatrixTest_CSRAdaptive.h" src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu +2 −2 Original line number Diff line number Diff line /*************************************************************************** SparseMatrixTest_CSRHybrid.cu - description SparseMatrixTest_CSRAdaptive.cu - description ------------------- begin : Jan 23, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. Loading @@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ #include "SparseMatrixTest_CSRHybrid.h" #include "SparseMatrixTest_CSRAdaptive.h" src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h +18 −18 Original line number Diff line number Diff line /*************************************************************************** SparseMatrixTest_CSRHybrid.h - description SparseMatrixTest_CSRAdaptive.h - description ------------------- begin : Jan 23, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. Loading @@ -15,28 +15,28 @@ #ifdef HAVE_GTEST #include <gtest/gtest.h> const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments"; const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRAdaptive_segments"; // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types < TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid > TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > #ifdef HAVE_CUDA ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid > ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > #endif >; Loading Loading
src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +318 −128 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRKernelScalar.h> namespace TNL { namespace Algorithms { Loading @@ -28,14 +29,17 @@ enum class Type { }; template< typename Index > union Block { Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { union Block { Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { this->index[0] = row; this->index[1] = index; this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; } Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { this->index[0] = row; this->index[1] = 0; this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; Loading @@ -51,93 +55,177 @@ union Block { Block() = default; Type getType() const { if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) return Type::STREAM; if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) return Type::VECTOR; return Type::LONG; } Index getFirstRow() const { return index[ 0 ]; } Index getRowsInBlock() const { return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; } void print( std::ostream& str ) const { Type type = this->getType(); str << "Type: "; switch( type ) { case Type::STREAM: str << " Stream "; break; case Type::VECTOR: str << " Vector "; break; case Type::LONG: str << " Long "; break; } str << " first row: " << getFirstRow(); str << " rows per block: " << getRowsInBlock(); str << " index in warp: " << index[ 1 ]; } Index index[2]; // index[0] is row pointer, index[1] is index in warp uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID //twobytes[3/5] is nextRow - row }; template< typename Index > std::ostream& operator<< ( std::ostream& str, const Block< Index >& block ) { block.print( str ); return str; } #ifdef HAVE_CUDA template< typename Real, typename Index, int warpSize, template< int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP > __global__ void SpMVCSRAdaptive( const Real *inVector, Real *outVector, const Index* rowPointers, const Index* columnIndexes, const Real* values, const Block<Index> *blocks, int MAX_ELEM_PER_WARP, typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks, Index blocksSize, Index gridID) { int gridIdx, Offsets offsets, Index first, Index last, Fetch fetch, Reduction reduce, ResultKeeper keep, Real zero, Args... args ) { __shared__ Real shared[WARPS][SHARED_PER_WARP]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; constexpr size_t MAX_X_DIM = 2147483647; const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if (blockIdx >= blocksSize) return; Real result = 0.0; Real result = zero; bool compute( true ); const Index laneID = threadIdx.x & 31; // & is cheaper than % Block<Index> block = blocks[blockIdx]; const Index minID = rowPointers[block.index[0]/* minRow */]; const Index minID = offsets[block.index[0]/* minRow */]; Index i, to, maxID; if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { /////////////////////////////////////* CSR STREAM *////////////// if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { /**** * CSR Stream: Copy first all data into shared memory */ const Index warpID = threadIdx.x / 32; maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; /* Stream data to shared memory */ for (i = laneID + minID; i < maxID; i += warpSize) shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]]; for( Index globalIdx = laneID + minID; globalIdx < maxID; globalIdx += warpSize ) { shared[warpID][i - minID] = //fetch( globalIdx, compute ); details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); printf( "Stream: Fetch at %d -> %f \n", globalIdx, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ) ); // TODO:: fix this //values[i] * inVector[columnIndexes[i]]; } const Index maxRow = block.index[0]/* minRow */ + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); /* Calculate result */ for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { to = rowPointers[i + 1] - minID; // end of preprocessed data result = 0; for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { to = offsets[i + 1] - minID; // end of preprocessed data result = zero; /* Scalar reduction */ for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID) result += shared[warpID][sharedID]; for( Index sharedID = offsets[ i ] - minID; sharedID < to; ++sharedID) result = reduce( result, shared[warpID][sharedID] ); outVector[i] = result; // Write result printf( "Stream: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result ); keep( i, result ); //outVector[i] = result; // Write result } } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) { } else //if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) { printf( "Vector: threadIdx = %d \n", threadIdx ); /////////////////////////////////////* CSR VECTOR *////////////// maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; const Index segmentIdx = block.index[0]; for (i = minID + laneID; i < maxID; i += warpSize) result += values[i] * inVector[columnIndexes[i]]; for( Index globalIdx = minID + laneID; globalIdx < maxID; globalIdx += warpSize ) result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx //values[i] * inVector[columnIndexes[i]]; /* Parallel reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result } else { /////////////////////////////////////* CSR VECTOR L *///////////// /* Number of elements processed by previous warps */ const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP; to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP; maxID = rowPointers[block.index[0]/* minRow */ + 1]; if (to > maxID) to = maxID; for (i = minID + offset + laneID; i < to; i += warpSize) result += values[i] * inVector[columnIndexes[i]]; result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); if( laneID == 0 ) { printf( "Vector: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result ); keep( segmentIdx, result ); //outVector[block.index[0]/* minRow */] = result; // Write result } }/* else { ///////////////////////////////////// CSR VECTOR L ///////////// // Number of elements processed by previous warps const Index offset = block.index[1] * MAX_ELEM_PER_WARP; to = minID + (block.index[1] + 1) * MAX_ELEM_PER_WARP; maxID = offsets[block.index[0] + 1]; if( to > maxID ) to = maxID; for( Index globalIdx = minID + offset + laneID; globalIdx < to; globalIdx += warpSize ) result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; /* Parallel reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result); } if (laneID == 0) atomicAdd(&outVector[block.index[0] ], result); }*/ } #endif Loading @@ -150,6 +238,20 @@ struct CSRKernelAdaptiveView using DeviceType = Device; using ViewType = CSRKernelAdaptiveView< Index, Device >; using ConstViewType = CSRKernelAdaptiveView< Index, Device >; using BlocksType = TNL::Containers::Vector< Block< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; CSRKernelAdaptiveView() = default; CSRKernelAdaptiveView( BlocksType& blocks ) { this->blocks.bind( blocks ); }; void setBlocks( BlocksType& blocks ) { this->blocks.bind( blocks ); } ViewType getView() { return *this; }; Loading @@ -170,38 +272,102 @@ struct CSRKernelAdaptiveView const Real& zero, Args... args ) const { #ifdef HAVE_CUDA if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) { TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); return; } this->printBlocks(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; static constexpr Index THREADS_VECTOR = 128; static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp for CSR Light, MultiVector */ static constexpr Index MAX_ELEMENTS_PER_WARP = 384; Index blocks; const Index threads = matrix.THREADS_ADAPTIVE; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); /* Number of warps in block for CSR Adaptive */ static constexpr Index WARPS = THREADS_ADAPTIVE / 32; /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / WARPS; constexpr int warpSize = 32; Index blocksCount; const Index threads = THREADS_ADAPTIVE; constexpr size_t MAX_X_DIM = 2147483647; /* Fill blocks */ size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block size_t neededThreads = blocks.getSize() * warpSize; // one warp per block /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if (MAX_X_DIM * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { if (MAX_X_DIM * threads >= neededThreads) { blocksCount = roundUpDivision(neededThreads, threads); neededThreads = 0; } else { blocks = MAX_X_DIM; } else { blocksCount = MAX_X_DIM; neededThreads -= MAX_X_DIM * threads; } SpMVCSRAdaptive< Real, Index, warpSize, matrix.WARPS, matrix.SHARED_PER_WARP, matrix.MAX_ELEMENTS_PER_WARP_ADAPT > <<<blocks, threads>>>( inVector, outVector, matrix.getRowPointers().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.blocks.getData(), matrix.blocks.getSize() - 1, // last block shouldn't be used grid ); segmentsReductionCSRAdaptiveKernel< warpSize, WARPS, SHARED_PER_WARP, MAX_ELEMENTS_PER_WARP_ADAPT, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > <<<blocksCount, threads>>>( blocks.getData(), blocks.getSize() - 1, // last block shouldn't be used gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); } #endif } CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView ) { this->blocks.bind( kernelView.blocks ); return *this; } void printBlocks() const { for( Index i = 0; i < this->blocks.getSize(); i++ ) { auto block = blocks.getElement( i ); std::cout << "Block " << i << " : " << block << std::endl; } } protected: BlocksView blocks; }; template< typename Index, Loading @@ -212,6 +378,9 @@ struct CSRKernelAdaptive using DeviceType = Device; using ViewType = CSRKernelAdaptiveView< Index, Device >; using ConstViewType = CSRKernelAdaptiveView< Index, Device >; using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; Loading @@ -227,23 +396,35 @@ struct CSRKernelAdaptive /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / WARPS; /* Max length of row to process one warp for CSR Light, MultiVector */ static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; template< typename Offsets > Index findLimit(const Index start, const Offsets& offsets, const Index size, Type &type, Index &sum) { Index &sum) { sum = 0; for (Index current = start; current < size - 1; ++current) { for (Index current = start; current < size - 1; ++current) { Index elements = offsets.getElement(current + 1) - offsets.getElement(current); sum += elements; if (sum > matrix.SHARED_PER_WARP) { if (current - start > 0) { // extra row if (sum >SHARED_PER_WARP) { if (current - start > 0) { // extra row type = Type::STREAM; return current; } else { // one long row if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP_ADAPT) } else { // one long row if (sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT) type = Type::VECTOR; else type = Type::LONG; Loading @@ -251,7 +432,6 @@ struct CSRKernelAdaptive } } } type = Type::STREAM; return size - 1; // return last row pointer } Loading @@ -269,8 +449,7 @@ struct CSRKernelAdaptive while (nextStart != rows - 1) { Type type; nextStart = findLimit<Real, Index, Device, KernelType>( start, *this, rows, type, sum ); nextStart = findLimit( start, offsets, rows, type, sum ); if (type == Type::LONG) { Loading @@ -284,8 +463,8 @@ struct CSRKernelAdaptive { inBlock.emplace_back(start, type, nextStart, this->rowPointers.getElement(nextStart), this->rowPointers.getElement(start) ); offsets.getElement(nextStart), offsets.getElement(start) ); } start = nextStart; } Loading @@ -295,11 +474,19 @@ struct CSRKernelAdaptive this->blocks.setSize(inBlock.size()); for (size_t i = 0; i < inBlock.size(); ++i) this->blocks.setElement(i, inBlock[i]); this->view.setBlocks( blocks ); }; ViewType getView() { return view; }; void reset() { this->blocks.reset(); this->view.setBlocks( blocks ); } ViewType getView() { return this->view; }; ConstViewType getConstView() const { return ConstViewType(); }; ConstViewType getConstView() const { return this->view; }; template< typename OffsetsView, typename Fetch, Loading @@ -319,6 +506,9 @@ struct CSRKernelAdaptive view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } protected: BlocksType blocks; ViewType view; }; Loading
src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp +2 −2 Original line number Diff line number Diff line /*************************************************************************** SparseMatrixTest_CSRHybrid.cpp - description SparseMatrixTest_CSRAdaptive.cpp - description ------------------- begin : Jan 23, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. Loading @@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ #include "SparseMatrixTest_CSRHybrid.h" #include "SparseMatrixTest_CSRAdaptive.h"
src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu +2 −2 Original line number Diff line number Diff line /*************************************************************************** SparseMatrixTest_CSRHybrid.cu - description SparseMatrixTest_CSRAdaptive.cu - description ------------------- begin : Jan 23, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. Loading @@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ #include "SparseMatrixTest_CSRHybrid.h" #include "SparseMatrixTest_CSRAdaptive.h"
src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h +18 −18 Original line number Diff line number Diff line /*************************************************************************** SparseMatrixTest_CSRHybrid.h - description SparseMatrixTest_CSRAdaptive.h - description ------------------- begin : Jan 23, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. Loading @@ -15,28 +15,28 @@ #ifdef HAVE_GTEST #include <gtest/gtest.h> const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments"; const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRAdaptive_segments"; // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types < TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid > TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > #ifdef HAVE_CUDA ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid > ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > #endif >; Loading