Loading src/TNL/Algorithms/Segments/BiEllpack.hpp +5 −6 Original line number Diff line number Diff line Loading @@ -211,7 +211,7 @@ computeColumnSizes( const SizesHolder& segmentsSizes ) if( strip == numberOfStrips - 1 ) { IndexType segmentsCount = size - firstSegment; while( !( segmentsCount > TNL::pow( 2, getLogWarpSize() - 1 - emptyGroups ) ) ) while( segmentsCount <= TNL::pow( 2, getLogWarpSize() - 1 - emptyGroups ) - 1 ) emptyGroups++; for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ ) groupPointersView[ group ] = 0; Loading Loading @@ -290,7 +290,7 @@ template< typename Device, void BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >:: verifyRowLengths( const SizesHolder& segmentsSizes ) { bool ok = true; std::cerr << "segmentsSizes = " << segmentsSizes << std::endl; for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ ) { const IndexType strip = segmentIdx / getWarpSize(); Loading @@ -303,6 +303,7 @@ verifyRowLengths( const SizesHolder& segmentsSizes ) const IndexType groupsCount = details::BiEllpack< Index, Device, Organization, WarpSize >::getActiveGroupsCount( this->rowPermArray.getConstView(), segmentIdx ); for( IndexType group = 0; group < groupsCount; group++ ) { std::cerr << "groupIdx = " << group << " groupLength = " << this->getGroupLength( strip, group ) << std::endl; for( IndexType i = 0; i < this->getGroupLength( strip, group ); i++ ) { IndexType biElementPtr = elementPtr; Loading @@ -315,11 +316,9 @@ verifyRowLengths( const SizesHolder& segmentsSizes ) } } if( segmentsSizes.getElement( segmentIdx ) > rowLength ) ok = false; } if( ! ok ) throw( std::logic_error( "Segments capacities verification failed." ) ); } } template< typename Device, typename Index, Loading Loading @@ -349,7 +348,7 @@ setSegmentsSizes( const SizesHolder& segmentsSizes ) this->groupPointers.template scan< Algorithms::ScanType::Exclusive >(); this->verifyRowPerm( segmentsSizes ); this->verifyRowLengths( segmentsSizes ); //this->verifyRowLengths( segmentsSizes ); // TODO: I am not sure what this test is doing. this->storageSize = getWarpSize() * this->groupPointers.getElement( strips * ( getLogWarpSize() + 1 ) ); } else Loading src/TNL/Algorithms/Segments/BiEllpackView.hpp +75 −19 Original line number Diff line number Diff line Loading @@ -323,6 +323,8 @@ BiEllpackView< Device, Index, Organization, WarpSize >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; if( this->getStorageSize() == 0 ) return; if( std::is_same< DeviceType, Devices::Host >::value ) for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ ) { Loading @@ -335,11 +337,20 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio IndexType localIdx( 0 ); RealType aux( zero ); bool compute( true ); //std::cerr << "segmentIdx = " << segmentIdx // << " stripIdx = " << stripIdx // << " inStripIdx = " << inStripIdx // << " groupIdx = " << groupIdx // << " groupsCount = " << groupsCount // << std::endl; for( IndexType group = 0; group < groupsCount && compute; group++ ) { const IndexType groupSize = details::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getGroupSize( groupPointers, stripIdx, group ); IndexType groupWidth = groupSize / groupHeight; const IndexType globalIdxBack = globalIdx; //std::cerr << " groupSize = " << groupSize // << " groupWidth = " << groupWidth // << std::endl; if( Organization == RowMajorOrder ) globalIdx += inStripIdx * groupWidth; else Loading @@ -364,15 +375,16 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA constexpr int BlockDim = 256;//getWarpSize(); constexpr int BlockDim = 256; dim3 cudaBlockSize = BlockDim; const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() ); const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); IndexType sharedMemory = 0; if( ! Organization ) if( Organization == ColumnMajorOrder ) sharedMemory = cudaBlockSize.x * sizeof( RealType ); //printStructure( std::cerr ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { dim3 cudaGridSize = Cuda::getMaxGridSize(); Loading Loading @@ -450,7 +462,8 @@ printStructure( std::ostream& str ) const { const IndexType groupSize = groupPointers.getElement( groupIdx + 1 ) - groupPointers.getElement( groupIdx ); const IndexType groupWidth = groupSize / groupHeight; str << "\tGroup: " << groupIdx << " size = " << groupSize << " width = " << groupWidth << " height = " << groupHeight << std::endl; str << "\tGroup: " << groupIdx << " size = " << groupSize << " width = " << groupWidth << " height = " << groupHeight << " offset = " << groupPointers.getElement( groupIdx ) << std::endl; groupHeight /= 2; } } Loading Loading @@ -545,24 +558,51 @@ segmentsReductionKernel( IndexType gridIdx, if( warpStart >= last ) return; const int warpIdx = threadIdx.x / WarpSize; const int warpsCount = BlockDim / WarpSize; constexpr int groupsInStrip = 6; //getLogWarpSize() + 1; IndexType firstGroupIdx = strip * groupsInStrip; IndexType firstGroupInBlock = 8 * ( strip / 8 ) * groupsInStrip; IndexType groupHeight = getWarpSize(); IndexType firstGroupIdx = strip * ( getLogWarpSize() + 1 ); ///// // Allocate shared memory __shared__ RealType results[ BlockDim ]; results[ threadIdx.x ] = zero; __shared__ IndexType sharedGroupPointers[ 7 ]; // TODO: getLogWarpSize() + 1 ]; if( threadIdx.x <= getLogWarpSize() + 1 ) sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupIdx + threadIdx.x ]; __shared__ IndexType sharedGroupPointers[ groupsInStrip * warpsCount + 1 ]; ///// // Fetch group pointers to shared memory //bool b1 = ( threadIdx.x <= warpsCount * groupsInStrip ); //bool b2 = ( firstGroupIdx + threadIdx.x % groupsInStrip < this->groupPointers.getSize() ); //printf( "tid = %d warpsCount * groupsInStrip = %d firstGroupIdx + threadIdx.x = %d this->groupPointers.getSize() = %d read = %d %d\n", // threadIdx.x, warpsCount * groupsInStrip, // firstGroupIdx + threadIdx.x, // this->groupPointers.getSize(), ( int ) b1, ( int ) b2 ); if( threadIdx.x <= warpsCount * groupsInStrip && firstGroupInBlock + threadIdx.x < this->groupPointers.getSize() ) { sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupInBlock + threadIdx.x ]; //printf( " sharedGroupPointers[ %d ] = %d \n", // threadIdx.x, sharedGroupPointers[ threadIdx.x ] ); } const IndexType sharedGroupOffset = warpIdx * groupsInStrip; __syncthreads(); ///// // Perform the reduction bool compute( true ); if( Organization == RowMajorOrder ) { for( IndexType group = 0; group < getLogWarpSize() + 1; group++ ) { IndexType groupBegin = sharedGroupPointers[ group ]; IndexType groupEnd = sharedGroupPointers[ group + 1 ]; IndexType groupBegin = sharedGroupPointers[ sharedGroupOffset + group ]; IndexType groupEnd = sharedGroupPointers[ sharedGroupOffset + group + 1 ]; TNL_ASSERT_LT( groupBegin, this->getStorageSize(), "" ); //if( groupBegin >= this->getStorageSize() ) // printf( "tid = %d sharedGroupOffset + group + 1 = %d strip = %d group = %d groupBegin = %d groupEnd = %d this->getStorageSize() = %d\n", // threadIdx.x, sharedGroupOffset + group + 1, strip, group, groupBegin, groupEnd, this->getStorageSize() ); TNL_ASSERT_LT( groupEnd, this->getStorageSize(), "" ); if( groupEnd - groupBegin > 0 ) { if( inWarpIdx < groupHeight ) Loading @@ -570,7 +610,15 @@ segmentsReductionKernel( IndexType gridIdx, const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight; IndexType globalIdx = groupBegin + inWarpIdx * groupWidth; for( IndexType i = 0; i < groupWidth && compute; i++ ) { TNL_ASSERT_LT( globalIdx, this->getStorageSize(), "" ); results[ threadIdx.x ] = reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) ); //if( strip == 1 ) // printf( "tid = %d i = %d groupHeight = %d groupWidth = %d globalIdx = %d fetch = %f results = %f \n", // threadIdx.x, i, // groupHeight, groupWidth, // globalIdx, fetch( globalIdx, compute ), results[ threadIdx.x ] ); } } } groupHeight >>= 1; Loading @@ -581,8 +629,10 @@ segmentsReductionKernel( IndexType gridIdx, RealType* temp = Cuda::getSharedMemory< RealType >(); for( IndexType group = 0; group < getLogWarpSize() + 1; group++ ) { IndexType groupBegin = sharedGroupPointers[ group ]; IndexType groupEnd = sharedGroupPointers[ group + 1 ]; IndexType groupBegin = sharedGroupPointers[ sharedGroupOffset + group ]; IndexType groupEnd = sharedGroupPointers[ sharedGroupOffset + group + 1 ]; //if( threadIdx.x < 36 && strip == 1 ) // printf( " tid = %d strip = %d group = %d groupBegin = %d groupEnd = %d \n", threadIdx.x, strip, group, groupBegin, groupEnd ); if( groupEnd - groupBegin > 0 ) { temp[ threadIdx.x ] = zero; Loading @@ -590,6 +640,8 @@ segmentsReductionKernel( IndexType gridIdx, while( globalIdx < groupEnd ) { temp[ threadIdx.x ] = reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) ); //if( strip == 1 ) // printf( "tid %d fetch %f temp %f \n", threadIdx.x, fetch( globalIdx, compute ), temp[ threadIdx.x ] ); globalIdx += getWarpSize(); } // TODO: reduction via templates Loading Loading @@ -628,6 +680,10 @@ segmentsReductionKernel( IndexType gridIdx, if( warpStart + inWarpIdx >= last ) return; ///// // Store the results //if( strip == 1 ) // printf( "Adding %f at %d \n", results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( blockDim.x - 1 ) ], warpStart + inWarpIdx ); keeper( warpStart + inWarpIdx, results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( blockDim.x - 1 ) ] ); } #endif Loading src/UnitTests/Matrices/SparseMatrixTest.hpp +49 −1 Original line number Diff line number Diff line Loading @@ -1235,7 +1235,6 @@ void test_VectorProduct() EXPECT_EQ( outVector_4.getElement( 6 ), 280 ); EXPECT_EQ( outVector_4.getElement( 7 ), 330 ); /* * Sets up the following 8x8 sparse matrix: * Loading Loading @@ -1308,6 +1307,55 @@ void test_VectorProduct() EXPECT_EQ( outVector_5.getElement( 5 ), 224 ); EXPECT_EQ( outVector_5.getElement( 6 ), 352 ); EXPECT_EQ( outVector_5.getElement( 7 ), 520 ); ///// // Large test if( ( std::is_same< IndexType, int >::value || std::is_same< IndexType, long int >::value ) && std::is_same< RealType, double >::value ) { const IndexType size( 35 ); //for( int size = 1; size < 1000; size++ ) { //std::cerr << " size = " << size << std::endl; // Test with large diagonal matrix Matrix m1( size, size ); TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowCapacities( size ); rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return 1; } ); m1.setRowCapacities( rowCapacities ); auto f1 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) { if( localIdx == 0 ) { value = row + 1; column = row; } }; m1.forAllRows( f1 ); TNL::Containers::Vector< double, DeviceType, IndexType > in( size, 1.0 ), out( size, 0.0 ); m1.vectorProduct( in, out ); //std::cerr << out << std::endl; for( IndexType i = 0; i < size; i++ ) EXPECT_EQ( out.getElement( i ), i + 1 ); // Test with large triangular matrix Matrix m2( size, size ); rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return i + 1; } ); m2.setRowCapacities( rowCapacities ); auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) { if( localIdx <= row ) { value = row -localIdx + 1; column = localIdx; } }; m2.forAllRows( f2 ); out = 0.0; m2.vectorProduct( in, out ); //std::cerr << out << std::endl; for( IndexType i = 0; i < size; i++ ) EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 ); } } } template< typename Matrix > Loading src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h +4 −4 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ using ColumnMajorBiEllpack = TNL::Algorithms::Segments::BiEllpack< Device, Index // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types < TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >, Loading @@ -40,15 +40,15 @@ using MatrixTypes = ::testing::Types TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack > #ifdef HAVE_CUDA ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, ,TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack > #endif >; Loading src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h +1 −0 Original line number Diff line number Diff line Loading @@ -52,4 +52,5 @@ using MatrixTypes = ::testing::Types #endif #include "SparseMatrixTest.h" #include "../main.h" Loading
src/TNL/Algorithms/Segments/BiEllpack.hpp +5 −6 Original line number Diff line number Diff line Loading @@ -211,7 +211,7 @@ computeColumnSizes( const SizesHolder& segmentsSizes ) if( strip == numberOfStrips - 1 ) { IndexType segmentsCount = size - firstSegment; while( !( segmentsCount > TNL::pow( 2, getLogWarpSize() - 1 - emptyGroups ) ) ) while( segmentsCount <= TNL::pow( 2, getLogWarpSize() - 1 - emptyGroups ) - 1 ) emptyGroups++; for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ ) groupPointersView[ group ] = 0; Loading Loading @@ -290,7 +290,7 @@ template< typename Device, void BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >:: verifyRowLengths( const SizesHolder& segmentsSizes ) { bool ok = true; std::cerr << "segmentsSizes = " << segmentsSizes << std::endl; for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ ) { const IndexType strip = segmentIdx / getWarpSize(); Loading @@ -303,6 +303,7 @@ verifyRowLengths( const SizesHolder& segmentsSizes ) const IndexType groupsCount = details::BiEllpack< Index, Device, Organization, WarpSize >::getActiveGroupsCount( this->rowPermArray.getConstView(), segmentIdx ); for( IndexType group = 0; group < groupsCount; group++ ) { std::cerr << "groupIdx = " << group << " groupLength = " << this->getGroupLength( strip, group ) << std::endl; for( IndexType i = 0; i < this->getGroupLength( strip, group ); i++ ) { IndexType biElementPtr = elementPtr; Loading @@ -315,11 +316,9 @@ verifyRowLengths( const SizesHolder& segmentsSizes ) } } if( segmentsSizes.getElement( segmentIdx ) > rowLength ) ok = false; } if( ! ok ) throw( std::logic_error( "Segments capacities verification failed." ) ); } } template< typename Device, typename Index, Loading Loading @@ -349,7 +348,7 @@ setSegmentsSizes( const SizesHolder& segmentsSizes ) this->groupPointers.template scan< Algorithms::ScanType::Exclusive >(); this->verifyRowPerm( segmentsSizes ); this->verifyRowLengths( segmentsSizes ); //this->verifyRowLengths( segmentsSizes ); // TODO: I am not sure what this test is doing. this->storageSize = getWarpSize() * this->groupPointers.getElement( strips * ( getLogWarpSize() + 1 ) ); } else Loading
src/TNL/Algorithms/Segments/BiEllpackView.hpp +75 −19 Original line number Diff line number Diff line Loading @@ -323,6 +323,8 @@ BiEllpackView< Device, Index, Organization, WarpSize >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; if( this->getStorageSize() == 0 ) return; if( std::is_same< DeviceType, Devices::Host >::value ) for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ ) { Loading @@ -335,11 +337,20 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio IndexType localIdx( 0 ); RealType aux( zero ); bool compute( true ); //std::cerr << "segmentIdx = " << segmentIdx // << " stripIdx = " << stripIdx // << " inStripIdx = " << inStripIdx // << " groupIdx = " << groupIdx // << " groupsCount = " << groupsCount // << std::endl; for( IndexType group = 0; group < groupsCount && compute; group++ ) { const IndexType groupSize = details::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getGroupSize( groupPointers, stripIdx, group ); IndexType groupWidth = groupSize / groupHeight; const IndexType globalIdxBack = globalIdx; //std::cerr << " groupSize = " << groupSize // << " groupWidth = " << groupWidth // << std::endl; if( Organization == RowMajorOrder ) globalIdx += inStripIdx * groupWidth; else Loading @@ -364,15 +375,16 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA constexpr int BlockDim = 256;//getWarpSize(); constexpr int BlockDim = 256; dim3 cudaBlockSize = BlockDim; const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() ); const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); IndexType sharedMemory = 0; if( ! Organization ) if( Organization == ColumnMajorOrder ) sharedMemory = cudaBlockSize.x * sizeof( RealType ); //printStructure( std::cerr ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { dim3 cudaGridSize = Cuda::getMaxGridSize(); Loading Loading @@ -450,7 +462,8 @@ printStructure( std::ostream& str ) const { const IndexType groupSize = groupPointers.getElement( groupIdx + 1 ) - groupPointers.getElement( groupIdx ); const IndexType groupWidth = groupSize / groupHeight; str << "\tGroup: " << groupIdx << " size = " << groupSize << " width = " << groupWidth << " height = " << groupHeight << std::endl; str << "\tGroup: " << groupIdx << " size = " << groupSize << " width = " << groupWidth << " height = " << groupHeight << " offset = " << groupPointers.getElement( groupIdx ) << std::endl; groupHeight /= 2; } } Loading Loading @@ -545,24 +558,51 @@ segmentsReductionKernel( IndexType gridIdx, if( warpStart >= last ) return; const int warpIdx = threadIdx.x / WarpSize; const int warpsCount = BlockDim / WarpSize; constexpr int groupsInStrip = 6; //getLogWarpSize() + 1; IndexType firstGroupIdx = strip * groupsInStrip; IndexType firstGroupInBlock = 8 * ( strip / 8 ) * groupsInStrip; IndexType groupHeight = getWarpSize(); IndexType firstGroupIdx = strip * ( getLogWarpSize() + 1 ); ///// // Allocate shared memory __shared__ RealType results[ BlockDim ]; results[ threadIdx.x ] = zero; __shared__ IndexType sharedGroupPointers[ 7 ]; // TODO: getLogWarpSize() + 1 ]; if( threadIdx.x <= getLogWarpSize() + 1 ) sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupIdx + threadIdx.x ]; __shared__ IndexType sharedGroupPointers[ groupsInStrip * warpsCount + 1 ]; ///// // Fetch group pointers to shared memory //bool b1 = ( threadIdx.x <= warpsCount * groupsInStrip ); //bool b2 = ( firstGroupIdx + threadIdx.x % groupsInStrip < this->groupPointers.getSize() ); //printf( "tid = %d warpsCount * groupsInStrip = %d firstGroupIdx + threadIdx.x = %d this->groupPointers.getSize() = %d read = %d %d\n", // threadIdx.x, warpsCount * groupsInStrip, // firstGroupIdx + threadIdx.x, // this->groupPointers.getSize(), ( int ) b1, ( int ) b2 ); if( threadIdx.x <= warpsCount * groupsInStrip && firstGroupInBlock + threadIdx.x < this->groupPointers.getSize() ) { sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupInBlock + threadIdx.x ]; //printf( " sharedGroupPointers[ %d ] = %d \n", // threadIdx.x, sharedGroupPointers[ threadIdx.x ] ); } const IndexType sharedGroupOffset = warpIdx * groupsInStrip; __syncthreads(); ///// // Perform the reduction bool compute( true ); if( Organization == RowMajorOrder ) { for( IndexType group = 0; group < getLogWarpSize() + 1; group++ ) { IndexType groupBegin = sharedGroupPointers[ group ]; IndexType groupEnd = sharedGroupPointers[ group + 1 ]; IndexType groupBegin = sharedGroupPointers[ sharedGroupOffset + group ]; IndexType groupEnd = sharedGroupPointers[ sharedGroupOffset + group + 1 ]; TNL_ASSERT_LT( groupBegin, this->getStorageSize(), "" ); //if( groupBegin >= this->getStorageSize() ) // printf( "tid = %d sharedGroupOffset + group + 1 = %d strip = %d group = %d groupBegin = %d groupEnd = %d this->getStorageSize() = %d\n", // threadIdx.x, sharedGroupOffset + group + 1, strip, group, groupBegin, groupEnd, this->getStorageSize() ); TNL_ASSERT_LT( groupEnd, this->getStorageSize(), "" ); if( groupEnd - groupBegin > 0 ) { if( inWarpIdx < groupHeight ) Loading @@ -570,7 +610,15 @@ segmentsReductionKernel( IndexType gridIdx, const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight; IndexType globalIdx = groupBegin + inWarpIdx * groupWidth; for( IndexType i = 0; i < groupWidth && compute; i++ ) { TNL_ASSERT_LT( globalIdx, this->getStorageSize(), "" ); results[ threadIdx.x ] = reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) ); //if( strip == 1 ) // printf( "tid = %d i = %d groupHeight = %d groupWidth = %d globalIdx = %d fetch = %f results = %f \n", // threadIdx.x, i, // groupHeight, groupWidth, // globalIdx, fetch( globalIdx, compute ), results[ threadIdx.x ] ); } } } groupHeight >>= 1; Loading @@ -581,8 +629,10 @@ segmentsReductionKernel( IndexType gridIdx, RealType* temp = Cuda::getSharedMemory< RealType >(); for( IndexType group = 0; group < getLogWarpSize() + 1; group++ ) { IndexType groupBegin = sharedGroupPointers[ group ]; IndexType groupEnd = sharedGroupPointers[ group + 1 ]; IndexType groupBegin = sharedGroupPointers[ sharedGroupOffset + group ]; IndexType groupEnd = sharedGroupPointers[ sharedGroupOffset + group + 1 ]; //if( threadIdx.x < 36 && strip == 1 ) // printf( " tid = %d strip = %d group = %d groupBegin = %d groupEnd = %d \n", threadIdx.x, strip, group, groupBegin, groupEnd ); if( groupEnd - groupBegin > 0 ) { temp[ threadIdx.x ] = zero; Loading @@ -590,6 +640,8 @@ segmentsReductionKernel( IndexType gridIdx, while( globalIdx < groupEnd ) { temp[ threadIdx.x ] = reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) ); //if( strip == 1 ) // printf( "tid %d fetch %f temp %f \n", threadIdx.x, fetch( globalIdx, compute ), temp[ threadIdx.x ] ); globalIdx += getWarpSize(); } // TODO: reduction via templates Loading Loading @@ -628,6 +680,10 @@ segmentsReductionKernel( IndexType gridIdx, if( warpStart + inWarpIdx >= last ) return; ///// // Store the results //if( strip == 1 ) // printf( "Adding %f at %d \n", results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( blockDim.x - 1 ) ], warpStart + inWarpIdx ); keeper( warpStart + inWarpIdx, results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( blockDim.x - 1 ) ] ); } #endif Loading
src/UnitTests/Matrices/SparseMatrixTest.hpp +49 −1 Original line number Diff line number Diff line Loading @@ -1235,7 +1235,6 @@ void test_VectorProduct() EXPECT_EQ( outVector_4.getElement( 6 ), 280 ); EXPECT_EQ( outVector_4.getElement( 7 ), 330 ); /* * Sets up the following 8x8 sparse matrix: * Loading Loading @@ -1308,6 +1307,55 @@ void test_VectorProduct() EXPECT_EQ( outVector_5.getElement( 5 ), 224 ); EXPECT_EQ( outVector_5.getElement( 6 ), 352 ); EXPECT_EQ( outVector_5.getElement( 7 ), 520 ); ///// // Large test if( ( std::is_same< IndexType, int >::value || std::is_same< IndexType, long int >::value ) && std::is_same< RealType, double >::value ) { const IndexType size( 35 ); //for( int size = 1; size < 1000; size++ ) { //std::cerr << " size = " << size << std::endl; // Test with large diagonal matrix Matrix m1( size, size ); TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowCapacities( size ); rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return 1; } ); m1.setRowCapacities( rowCapacities ); auto f1 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) { if( localIdx == 0 ) { value = row + 1; column = row; } }; m1.forAllRows( f1 ); TNL::Containers::Vector< double, DeviceType, IndexType > in( size, 1.0 ), out( size, 0.0 ); m1.vectorProduct( in, out ); //std::cerr << out << std::endl; for( IndexType i = 0; i < size; i++ ) EXPECT_EQ( out.getElement( i ), i + 1 ); // Test with large triangular matrix Matrix m2( size, size ); rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return i + 1; } ); m2.setRowCapacities( rowCapacities ); auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) { if( localIdx <= row ) { value = row -localIdx + 1; column = localIdx; } }; m2.forAllRows( f2 ); out = 0.0; m2.vectorProduct( in, out ); //std::cerr << out << std::endl; for( IndexType i = 0; i < size; i++ ) EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 ); } } } template< typename Matrix > Loading
src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h +4 −4 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ using ColumnMajorBiEllpack = TNL::Algorithms::Segments::BiEllpack< Device, Index // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types < TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >, Loading @@ -40,15 +40,15 @@ using MatrixTypes = ::testing::Types TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack > #ifdef HAVE_CUDA ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, ,TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack > #endif >; Loading
src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h +1 −0 Original line number Diff line number Diff line Loading @@ -52,4 +52,5 @@ using MatrixTypes = ::testing::Types #endif #include "SparseMatrixTest.h" #include "../main.h"