Commit 670f1892 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Tuning Ellpack format.

parent c2f4c4ab
Loading
Loading
Loading
Loading
+70 −1
Original line number Diff line number Diff line
@@ -115,6 +115,7 @@ bool tnlEllpackMatrix< Real, Device, Index >::setLike( const tnlEllpackMatrix< R
   if( ! tnlSparseMatrix< Real, Device, Index >::setLike( matrix ) )
      return false;
   this->rowLengths = matrix.rowLengths;
   this->alignedRows = matrix.alignedRows;
   return true;
}

@@ -125,6 +126,7 @@ void tnlEllpackMatrix< Real, Device, Index > :: reset()
{
   tnlSparseMatrix< Real, Device, Index >::reset();
   this->rowLengths = 0;
   this->alignedRows = 0;
}

template< typename Real,
@@ -693,6 +695,39 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost >
      }
};

#ifdef HAVE_CUDA    
template< 
   typename Real,
   typename Index >
__global__ void tnlEllpackMatrixVectorProductCudaKernel(
   const Index rows,
   const Index columns,
   const Index compressedRowsLengths,
   const Index alignedRows,
   const Index* columnIndexes,
   const Real* values,
   const Real* inVector,
   Real* outVector,
   const Index gridIdx )
{
   const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( rowIdx >= rows )
      return;
   Index i = rowIdx;
   Index el( 0 );
   Real result( 0.0 );
   Index columnIndex;
   while( el++ < compressedRowsLengths && ( columnIndex = columnIndexes[ i ] ) < columns )
   {
      result += values[ i ] * inVector[ columnIndex ];
      i += alignedRows;
   }
   outVector[ rowIdx ] = result;   
}
#endif



template<>
class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
{
@@ -734,7 +769,41 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
                                 const InVector& inVector,
                                 OutVector& outVector )
      {
         tnlMatrixVectorProductCuda( matrix, inVector, outVector );
         //tnlMatrixVectorProductCuda( matrix, inVector, outVector );
         #ifdef HAVE_CUDA    
            typedef tnlEllpackMatrix< Real, Device, Index > Matrix;
            typedef typename Matrix::IndexType IndexType;
            //Matrix* kernel_this = tnlCuda::passToDevice( matrix );
            //InVector* kernel_inVector = tnlCuda::passToDevice( inVector );
            //OutVector* kernel_outVector = tnlCuda::passToDevice( outVector );
            dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
            const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
            const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
            for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
            {
               if( gridIdx == cudaGrids - 1 )
                  cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
               tnlEllpackMatrixVectorProductCudaKernel
               < Real, Index >
                <<< cudaGridSize, cudaBlockSize >>>
                ( matrix.getRows(),
                  matrix.getColumns(),
                  matrix.rowLengths,
                  matrix.alignedRows,
                  matrix.columnIndexes.getData(),
                  matrix.values.getData(),
                  inVector.getData(),
                  outVector.getData(),
                  gridIdx );
               checkCudaDevice;
            }
            //tnlCuda::freeFromDevice( kernel_this );
            //tnlCuda::freeFromDevice( kernel_inVector );
            //tnlCuda::freeFromDevice( kernel_outVector );
            checkCudaDevice;
            cudaThreadSynchronize();
         #endif
         
      }
};

+58 −14
Original line number Diff line number Diff line
@@ -777,6 +777,42 @@ __global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKerne
}
#endif

#ifdef HAVE_CUDA    
template< 
   typename Real,
   typename Index,
   int SliceSize >
__global__ void tnlSlicedEllpackMatrixVectorProductCudaKernel(
   const Index rows,
   const Index columns,
   const Index* slicePointers,
   const Index* sliceCompressedRowsLengths,
   const Index* columnIndexes,
   const Real* values,
   const Real* inVector,
   Real* outVector,
   const Index gridIdx )
{
   const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( rowIdx >= rows )
      return;
   const Index sliceIdx = rowIdx / SliceSize;
   const Index slicePointer = slicePointers[ sliceIdx ];
   const Index rowLength = sliceCompressedRowsLengths[ sliceIdx ];   
   Index i = slicePointer + rowIdx - sliceIdx * SliceSize;
   const Index rowEnd = i + rowLength * SliceSize;
   Real result( 0.0 );
   Index columnIndex;
   while( i < rowEnd && ( columnIndex = columnIndexes[ i ] ) < columns )
   {
      result += values[ i ] * inVector[ columnIndex ];
      i += SliceSize;
   }
   outVector[ rowIdx ] = result;   
}
#endif


template<>
class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
{
@@ -862,13 +898,13 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
                                 const InVector& inVector,
                                 OutVector& outVector )
      {
         tnlMatrixVectorProductCuda( matrix, inVector, outVector );
         /*#ifdef HAVE_CUDA    
         //tnlMatrixVectorProductCuda( matrix, inVector, outVector );
         #ifdef HAVE_CUDA    
            typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix;
            typedef typename Matrix::IndexType IndexType;
            Matrix* kernel_this = tnlCuda::passToDevice( matrix );
            InVector* kernel_inVector = tnlCuda::passToDevice( inVector );
            OutVector* kernel_outVector = tnlCuda::passToDevice( outVector );
            //Matrix* kernel_this = tnlCuda::passToDevice( matrix );
            //InVector* kernel_inVector = tnlCuda::passToDevice( inVector );
            //OutVector* kernel_outVector = tnlCuda::passToDevice( outVector );
            dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
            const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
            const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
@@ -876,18 +912,26 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
            {
               if( gridIdx == cudaGrids - 1 )
                  cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
               tnlMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>>
                                              ( kernel_this,
                                                kernel_inVector,
                                                kernel_outVector,
               tnlSlicedEllpackMatrixVectorProductCudaKernel
               < Real, Index, SliceSize >
                <<< cudaGridSize, cudaBlockSize >>>
                ( matrix.getRows(),
                  matrix.getColumns(),
                  matrix.slicePointers.getData(),
                  matrix.sliceCompressedRowsLengths.getData(),
                  matrix.columnIndexes.getData(),
                  matrix.values.getData(),
                  inVector.getData(),
                  outVector.getData(),
                  gridIdx );
               checkCudaDevice;
            }
            tnlCuda::freeFromDevice( kernel_this );
            tnlCuda::freeFromDevice( kernel_inVector );
            tnlCuda::freeFromDevice( kernel_outVector );
            //tnlCuda::freeFromDevice( kernel_this );
            //tnlCuda::freeFromDevice( kernel_inVector );
            //tnlCuda::freeFromDevice( kernel_outVector );
            checkCudaDevice;
         #endif*/
            cudaThreadSynchronize();
         #endif
      }

};
+55 −6
Original line number Diff line number Diff line
@@ -22,12 +22,49 @@
#include <core/vectors/tnlVector.h>
#include <core/tnlTimerRT.h>
#include <matrices/tnlSlicedEllpackMatrix.h>
#include <matrices/tnlEllpackMatrix.h>

#ifdef HAVE_CUBLAS
//#include <cublas.h>
#endif    

template< typename Matrix >
__global__ void setCudaTestMatrixKernel( Matrix* matrix,
                                         const int elementsPerRow,
                                         const int gridIdx )
{
   const int rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( rowIdx >= matrix->getRows() )
      return;
   int col = Max( 0, rowIdx - elementsPerRow / 2 );   
   for( int element = 0; element < elementsPerRow; element++ )
   {
      if( col + element < matrix->getColumns() )
         matrix->setElementFast( rowIdx, col + element, 1.0 );
   }      
}

template< typename Matrix >
void setCudaTestMatrix( Matrix& matrix,
                        const int elementsPerRow )
{
   typedef typename Matrix::IndexType IndexType;
   typedef typename Matrix::RealType RealType;
   Matrix* kernel_matrix = tnlCuda::passToDevice( matrix );
   dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
   const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
   {
      if( gridIdx == cudaGrids - 1 )
         cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
      setCudaTestMatrixKernel< Matrix >
       <<< cudaGridSize, cudaBlockSize >>>
       ( kernel_matrix, elementsPerRow, gridIdx );
      checkCudaDevice;
   }
   tnlCuda::freeFromDevice( kernel_matrix );
}

int main( int argc, char* argv[] )
{
@@ -195,8 +232,9 @@ int main( int argc, char* argv[] )
    * Sliced Ellpack test
    */
   const int elementsPerRow( 5 );
   tnlSlicedEllpackMatrix< double, tnlHost > hostMatrix;
   tnlSlicedEllpackMatrix< double, tnlCuda > deviceMatrix;
   typedef tnlEllpackMatrix< double, tnlCuda > DeviceMatrix;
   tnlEllpackMatrix< double, tnlHost > hostMatrix;
   DeviceMatrix deviceMatrix;
   tnlVector< int, tnlHost, int > hostRowLengths;
   tnlVector< int, tnlCuda, int > deviceRowLengths;
   hostRowLengths.setSize( size );
@@ -206,21 +244,28 @@ int main( int argc, char* argv[] )
   hostRowLengths.setValue( elementsPerRow );
   deviceRowLengths.setValue( elementsPerRow );
   hostMatrix.setCompressedRowsLengths( hostRowLengths );
   deviceMatrix.setCompressedRowsLengths( deviceRowLengths );
   if( ! deviceMatrix.setCompressedRowsLengths( deviceRowLengths ) )
   {
      cerr << "Unable to allocate matrix elements." << endl;
      return false;
   }
   int elements( 0 );
   for( int row = 0; row < size; row++ )
   {
      cout << "Row " << row << "/" << size << "     \r" << flush;
      int col = Max( 0, row - elementsPerRow / 2 );   
      for( int element = 0; element < elementsPerRow; element++ )
      {
         if( col + element < size )
         {
            hostMatrix.setElement( row, col + element, 1.0 );
            deviceMatrix.setElement( row, col + element, 1.0 );
            //deviceMatrix.setElement( row, col + element, 1.0 );
            elements++;
         }
      }      
   }
   cout << endl;
   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
   datasetSize = loops * elements * sizeof( double ) / oneGB;
   hostVector.setValue( 1.0 );
   deviceVector.setValue( 1.0 );
@@ -242,7 +287,11 @@ int main( int argc, char* argv[] )
      
   if( hostVector2 != deviceVector2 )
   {      
      cerr << "Error in SliceEllpack Spmv kernel." << endl;
      cerr << "Error in SliceEllpack Spmv kernel at positions" << endl;
      //for( int i = 0; i < size; i++ )
      //   if( hostVector2.getElement( i ) != deviceVector2.getElement( i ) )
      //      cerr << " " << i;
         
   }
   bandwidth = 2 * datasetSize / loops / timer.getTime();
   cout << timer.getTime() << " => " << bandwidth << " GB/s" << " speedup " << hostTime / timer.getTime() << endl;
+1 −1

File changed.

Contains only whitespace changes.