Commit 3c945fad authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Fixing BiEllpack.

parent f6ace3fc
Loading
Loading
Loading
Loading
+5 −6
Original line number Diff line number Diff line
@@ -211,7 +211,7 @@ computeColumnSizes( const SizesHolder& segmentsSizes )
      if( strip == numberOfStrips - 1 )
      {
         IndexType segmentsCount = size - firstSegment;
         while( !( segmentsCount > TNL::pow( 2, getLogWarpSize() - 1 - emptyGroups ) ) )
         while( segmentsCount <= TNL::pow( 2, getLogWarpSize() - 1 - emptyGroups ) - 1 )
            emptyGroups++;
         for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ )
            groupPointersView[ group ] = 0;
@@ -290,7 +290,7 @@ template< typename Device,
void BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
verifyRowLengths( const SizesHolder& segmentsSizes )
{
   bool ok = true;
   std::cerr << "segmentsSizes = " << segmentsSizes << std::endl;
   for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ )
   {
      const IndexType strip = segmentIdx / getWarpSize();
@@ -303,6 +303,7 @@ verifyRowLengths( const SizesHolder& segmentsSizes )
      const IndexType groupsCount = details::BiEllpack< Index, Device, Organization, WarpSize >::getActiveGroupsCount( this->rowPermArray.getConstView(), segmentIdx );
      for( IndexType group = 0; group < groupsCount; group++ )
      {
         std::cerr << "groupIdx = " << group << " groupLength = " << this->getGroupLength( strip, group ) << std::endl;
         for( IndexType i = 0; i < this->getGroupLength( strip, group ); i++ )
         {
            IndexType biElementPtr = elementPtr;
@@ -315,11 +316,9 @@ verifyRowLengths( const SizesHolder& segmentsSizes )
         }
      }
      if( segmentsSizes.getElement( segmentIdx ) > rowLength )
         ok = false;
   }
   if( ! ok )
         throw( std::logic_error( "Segments capacities verification failed." ) );
   }
}

template< typename Device,
          typename Index,
@@ -349,7 +348,7 @@ setSegmentsSizes( const SizesHolder& segmentsSizes )
      this->groupPointers.template scan< Algorithms::ScanType::Exclusive >();

      this->verifyRowPerm( segmentsSizes );
      this->verifyRowLengths( segmentsSizes );
      //this->verifyRowLengths( segmentsSizes ); // TODO: I am not sure what this test is doing.
      this->storageSize =  getWarpSize() * this->groupPointers.getElement( strips * ( getLogWarpSize() + 1 ) );
   }
   else
+75 −19
Original line number Diff line number Diff line
@@ -323,6 +323,8 @@ BiEllpackView< Device, Index, Organization, WarpSize >::
segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
{
   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
   if( this->getStorageSize() == 0 )
      return;
   if( std::is_same< DeviceType, Devices::Host >::value )
      for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ )
      {
@@ -335,11 +337,20 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
         IndexType localIdx( 0 );
         RealType aux( zero );
         bool compute( true );
         //std::cerr << "segmentIdx = " << segmentIdx
         //          << " stripIdx = " << stripIdx
         //          << " inStripIdx = " << inStripIdx
         //          << " groupIdx = " << groupIdx
         //         << " groupsCount = " << groupsCount
         //          << std::endl;
         for( IndexType group = 0; group < groupsCount && compute; group++ )
         {
            const IndexType groupSize = details::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getGroupSize( groupPointers, stripIdx, group );
            IndexType groupWidth = groupSize / groupHeight;
            const IndexType globalIdxBack = globalIdx;
            //std::cerr << "  groupSize = " << groupSize
            //          << " groupWidth = " << groupWidth
            //          << std::endl;
            if( Organization == RowMajorOrder )
               globalIdx += inStripIdx * groupWidth;
            else
@@ -364,15 +375,16 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
   if( std::is_same< DeviceType, Devices::Cuda >::value )
   {
#ifdef HAVE_CUDA
      constexpr int BlockDim = 256;//getWarpSize();
      constexpr int BlockDim = 256;
      dim3 cudaBlockSize = BlockDim;
      const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() );
      const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x );
      const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
      IndexType sharedMemory = 0;
      if( ! Organization )
      if( Organization == ColumnMajorOrder )
         sharedMemory = cudaBlockSize.x * sizeof( RealType );

      //printStructure( std::cerr );
      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
      {
         dim3 cudaGridSize = Cuda::getMaxGridSize();
@@ -450,7 +462,8 @@ printStructure( std::ostream& str ) const
      {
         const IndexType groupSize = groupPointers.getElement( groupIdx + 1 ) - groupPointers.getElement( groupIdx );
         const IndexType groupWidth = groupSize / groupHeight;
         str << "\tGroup: " << groupIdx << " size = " << groupSize << " width = " << groupWidth << " height = " << groupHeight << std::endl;
         str << "\tGroup: " << groupIdx << " size = " << groupSize << " width = " << groupWidth << " height = " << groupHeight 
             << " offset = " << groupPointers.getElement( groupIdx ) << std::endl;
         groupHeight /= 2;
      }
   }
@@ -545,24 +558,51 @@ segmentsReductionKernel( IndexType gridIdx,
   if( warpStart >= last )
      return;

   const int warpIdx = threadIdx.x / WarpSize;
   const int warpsCount = BlockDim / WarpSize;
   constexpr int groupsInStrip = 6; //getLogWarpSize() + 1;
   IndexType firstGroupIdx = strip * groupsInStrip;
   IndexType firstGroupInBlock = 8 * ( strip / 8 ) * groupsInStrip;
   IndexType groupHeight = getWarpSize();
   IndexType firstGroupIdx = strip * ( getLogWarpSize() + 1 );

   /////
   // Allocate shared memory
   __shared__ RealType results[ BlockDim ];
   results[ threadIdx.x ] = zero;
   __shared__ IndexType sharedGroupPointers[ 7 ]; // TODO: getLogWarpSize() + 1 ];

   if( threadIdx.x <= getLogWarpSize() + 1 )
      sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupIdx + threadIdx.x ];
   __shared__ IndexType sharedGroupPointers[ groupsInStrip * warpsCount + 1 ];

   /////
   // Fetch group pointers to shared memory
   //bool b1 = ( threadIdx.x <= warpsCount * groupsInStrip ); 
   //bool b2 = ( firstGroupIdx + threadIdx.x % groupsInStrip < this->groupPointers.getSize() );
   //printf( "tid = %d warpsCount * groupsInStrip = %d firstGroupIdx + threadIdx.x = %d this->groupPointers.getSize() = %d read = %d %d\n",
   //   threadIdx.x, warpsCount * groupsInStrip,
   //   firstGroupIdx + threadIdx.x,
   //   this->groupPointers.getSize(), ( int ) b1, ( int ) b2 );
   if( threadIdx.x <= warpsCount * groupsInStrip && 
      firstGroupInBlock + threadIdx.x < this->groupPointers.getSize() )
   {
      sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupInBlock + threadIdx.x ];
      //printf( " sharedGroupPointers[ %d ] = %d \n",
      //   threadIdx.x, sharedGroupPointers[ threadIdx.x ] );
   }
   const IndexType sharedGroupOffset = warpIdx * groupsInStrip;
   __syncthreads();

   /////
   // Perform the reduction
   bool compute( true );
   if( Organization == RowMajorOrder )
   {
      for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
      {
         IndexType groupBegin = sharedGroupPointers[ group ];
         IndexType groupEnd = sharedGroupPointers[ group + 1 ];
         IndexType groupBegin = sharedGroupPointers[ sharedGroupOffset + group ];
         IndexType groupEnd = sharedGroupPointers[ sharedGroupOffset + group + 1 ];
         TNL_ASSERT_LT( groupBegin, this->getStorageSize(), "" );
         //if( groupBegin >= this->getStorageSize() )
         //   printf( "tid = %d sharedGroupOffset + group + 1 = %d strip = %d group = %d groupBegin = %d groupEnd = %d this->getStorageSize() = %d\n", 
         //      threadIdx.x, sharedGroupOffset + group + 1, strip, group, groupBegin, groupEnd, this->getStorageSize() );
         TNL_ASSERT_LT( groupEnd, this->getStorageSize(), "" );
         if( groupEnd - groupBegin > 0 )
         {
            if( inWarpIdx < groupHeight )
@@ -570,7 +610,15 @@ segmentsReductionKernel( IndexType gridIdx,
               const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight;
               IndexType globalIdx = groupBegin + inWarpIdx * groupWidth;
               for( IndexType i = 0; i < groupWidth && compute; i++ )
               {
                  TNL_ASSERT_LT( globalIdx, this->getStorageSize(), "" );
                  results[ threadIdx.x ] = reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) );
                  //if( strip == 1 )
                  //  printf( "tid = %d i = %d groupHeight = %d groupWidth = %d globalIdx = %d fetch = %f results = %f \n",
                  //      threadIdx.x, i,
                  //      groupHeight, groupWidth,
                  //      globalIdx, fetch( globalIdx, compute ), results[ threadIdx.x ] );
               }
            }
         }
         groupHeight >>= 1;
@@ -581,8 +629,10 @@ segmentsReductionKernel( IndexType gridIdx,
      RealType* temp = Cuda::getSharedMemory< RealType >();
      for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
      {
         IndexType groupBegin = sharedGroupPointers[ group ];
         IndexType groupEnd = sharedGroupPointers[ group + 1 ];
         IndexType groupBegin = sharedGroupPointers[ sharedGroupOffset + group ];
         IndexType groupEnd = sharedGroupPointers[ sharedGroupOffset + group + 1 ];
         //if( threadIdx.x < 36 && strip == 1 )
         //   printf( " tid = %d strip = %d group = %d groupBegin = %d groupEnd = %d \n", threadIdx.x, strip, group, groupBegin, groupEnd );
         if( groupEnd - groupBegin > 0 )
         {
            temp[ threadIdx.x ] = zero;
@@ -590,6 +640,8 @@ segmentsReductionKernel( IndexType gridIdx,
            while( globalIdx < groupEnd )
            {
               temp[ threadIdx.x ] = reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) );
               //if( strip == 1 )
               //   printf( "tid %d fetch %f temp %f \n", threadIdx.x, fetch( globalIdx, compute ), temp[ threadIdx.x ] );               
               globalIdx += getWarpSize();
            }
            // TODO: reduction via templates
@@ -628,6 +680,10 @@ segmentsReductionKernel( IndexType gridIdx,
   if( warpStart + inWarpIdx >= last )
      return;

   /////
   // Store the results
   //if( strip == 1 )
   //   printf( "Adding %f at %d \n", results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( blockDim.x - 1 ) ], warpStart + inWarpIdx );
   keeper( warpStart + inWarpIdx, results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( blockDim.x - 1 ) ] );
}
#endif
+49 −1
Original line number Diff line number Diff line
@@ -1235,7 +1235,6 @@ void test_VectorProduct()
   EXPECT_EQ( outVector_4.getElement( 6 ), 280 );
   EXPECT_EQ( outVector_4.getElement( 7 ), 330 );


   /*
    * Sets up the following 8x8 sparse matrix:
    *
@@ -1308,6 +1307,55 @@ void test_VectorProduct()
   EXPECT_EQ( outVector_5.getElement( 5 ), 224 );
   EXPECT_EQ( outVector_5.getElement( 6 ), 352 );
   EXPECT_EQ( outVector_5.getElement( 7 ), 520 );

   /////
   // Large test
   if( ( std::is_same< IndexType, int >::value || std::is_same< IndexType, long int >::value ) &&
      std::is_same< RealType, double >::value )
   {
      const IndexType size( 35 );
      //for( int size = 1; size < 1000; size++ )
      {
         //std::cerr << " size = " << size << std::endl;
         // Test with large diagonal matrix
         Matrix m1( size, size );
         TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowCapacities( size );
         rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return 1; } );
         m1.setRowCapacities( rowCapacities );
         auto f1 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
            if( localIdx == 0  )
            {
               value = row + 1;
               column = row;
            }
         };
         m1.forAllRows( f1 );
         TNL::Containers::Vector< double, DeviceType, IndexType > in( size, 1.0 ), out( size, 0.0 );
         m1.vectorProduct( in, out );
         //std::cerr << out << std::endl;
         for( IndexType i = 0; i < size; i++ )
            EXPECT_EQ( out.getElement( i ), i + 1 );

         // Test with large triangular matrix
         Matrix m2( size, size );
         rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return i + 1; } );
         m2.setRowCapacities( rowCapacities );
         auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
            if( localIdx <= row )
            {
               value = row -localIdx + 1;
               column = localIdx;
            }
         };
         m2.forAllRows( f2 );
         out = 0.0;
         m2.vectorProduct( in, out );
         //std::cerr << out << std::endl;
         for( IndexType i = 0; i < size; i++ )
            EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 );
         
      }
   }
}

template< typename Matrix >
+4 −4
Original line number Diff line number Diff line
@@ -30,7 +30,7 @@ using ColumnMajorBiEllpack = TNL::Algorithms::Segments::BiEllpack< Device, Index
// types for which MatrixTest is instantiated
using MatrixTypes = ::testing::Types
<
    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
@@ -40,15 +40,15 @@ using MatrixTypes = ::testing::Types
    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >
#ifdef HAVE_CUDA
   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
   ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >
#endif
>;

+1 −0
Original line number Diff line number Diff line
@@ -52,4 +52,5 @@ using MatrixTypes = ::testing::Types

#endif

#include "SparseMatrixTest.h"
#include "../main.h"
Loading