Commit 190cc1a7 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing tridiagonal matrix in CUDA.

parent 8ae66b7a
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -14,6 +14,8 @@
/Debug
/Release

.settings

# /src/
/src/Makefile.in

+1 −0
Original line number Diff line number Diff line
@@ -34,6 +34,7 @@ endif()

if( WITH_CUDA STREQUAL "yes" )
   AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128" )
   set( CUDA_LINKER_OPTIONS "-arch sm_20 -shared" ) # TODO: fix this in the rest of cmake files
else()
   AddCompilerFlag( "-std=gnu++0x" )
endif()      
+1 −1
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@

TARGET=TNL
INSTALL_PREFIX=${HOME}/local
WITH_CUDA=no
WITH_CUDA=yes
WITH_CUSPARSE=no
CUDA_ARCHITECTURE=2.0
TEMPLATE_EXPLICIT_INSTANTIATION=yes
+216 −66
Original line number Diff line number Diff line
@@ -21,6 +21,85 @@
#include <core/tnlAssert.h>
#include <matrices/tnlTridiagonalMatrix.h>

template< typename Device >
class tnlTridiagonalMatrixDeviceDependentCode;

template<>
class tnlTridiagonalMatrixDeviceDependentCode< tnlHost >
{
   public:

      template< typename Index >
      static Index getElementIndex( const Index rows,
                                    const Index row,
                                    const Index column )
      {
         return 3*row + column - row;
      }

      template< typename Vector,
                typename Index,
                typename ValuesType  >
      static typename Vector::RealType rowVectorProduct( const Index rows,
                                                         const ValuesType& values,
                                                         const Index row,
                                                         const Vector& vector )
      {
         if( row == 0 )
            return vector[ 0 ] * values[ 0 ] +
                   vector[ 1 ] * values[ 1 ];
         Index i = 3 * row - 1;
         if( row == rows - 1 )
            return vector[ row - 1 ] * values[ i++ ] +
                   vector[ row ] * values[ i ];
         return vector[ row - 1 ] * values[ i++ ] +
                vector[ row ] * values[ i++ ] +
                vector[ row + 1 ] * values[ i ];
      }
};

template<>
class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda >
{
   public:

      template< typename Index >
#ifdef HAVE_CUDA
      __device__ __host__
#endif
      static Index getElementIndex( const Index rows,
                                    const Index row,
                                    const Index column )
      {
         return ( column - row + 1 )*rows + row - 1;
      }
      
      template< typename Vector,
                typename Index,
                typename ValuesType >
#ifdef HAVE_CUDA
      __device__
#endif
      static typename Vector::RealType rowVectorProduct( const Index rows,
                                                         const ValuesType& values,
                                                         const Index row,
                                                         const Vector& vector )
      {
         if( row == 0 )
            return vector[ 0 ] * values[ 0 ] +
                   vector[ 1 ] * values[ rows - 1 ];
         Index i = row - 1;
         if( row == rows - 1 )
            return vector[ row - 1 ] * values[ i ] +
                   vector[ row ] * values[ i + rows ];
         return vector[ row - 1 ] * values[ i ] +
                vector[ row ] * values[ i + rows ] +
                vector[ row + 1 ] * values[ i + 2*rows ];
      }
};



template< typename Real,
          typename Device,
          typename Index >
@@ -58,6 +137,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setDimensions( const IndexType
   if( ! values.setSize( 3*Min( rows, columns ) ) )
      return false;
   this->values.setValue( 0.0 );
   return true;
}

template< typename Real,
@@ -174,7 +254,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setElementFast( const IndexTyp
                                                                  const IndexType column,
                                                                  const RealType& value )
{
   this->values.setElement( this->getElementIndex( row, column ), value );
   this->values[ this->getElementIndex( row, column ) ] = value;
   return true;
}

@@ -189,7 +269,6 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setElement( const IndexType ro
   return true;
}


template< typename Real,
          typename Device,
          typename Index >
@@ -201,7 +280,8 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addElementFast( const IndexTyp
                                                                  const RealType& value,
                                                                  const RealType& thisElementMultiplicator )
{
   this->values.addElement( this->getElementIndex( row, column ), value, thisElementMultiplicator );
   const Index i = this->getElementIndex( row, column );
   this->values[ i ] = thisElementMultiplicator*this->values[ i ] + value;
}

template< typename Real,
@@ -212,10 +292,10 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addElement( const IndexType ro
                                                              const RealType& value,
                                                              const RealType& thisElementMultiplicator )
{
   //this->values.addElement( this->getElementIndex( row, column ), value, thisElementMultiplicator );
   const Index i = this->getElementIndex( row, column );
   this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value );
}


template< typename Real,
          typename Device,
          typename Index >
@@ -231,7 +311,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setRowFast( const IndexType ro
            cerr << " elements = " << elements
                 << " this->columns = " << this->columns
                 << " this->getName() = " << this->getName() );
   return this->addRow( row, columns, values, elements, 0.0 );
   return this->addRowFast( row, columns, values, elements, 0.0 );
}

template< typename Real,
@@ -246,10 +326,9 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setRow( const IndexType row,
            cerr << " elements = " << elements
                 << " this->columns = " << this->columns
                 << " this->getName() = " << this->getName() );
   //return this->addRow( row, columns, values, elements, 0.0 );
   return this->addRow( row, columns, values, elements, 0.0 );
}


template< typename Real,
          typename Device,
          typename Index >
@@ -273,7 +352,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addRowFast( const IndexType ro
      const IndexType& column = columns[ i ];
      if( column < row - 1 || column > row + 1 )
         return false;
      addElement( row, column, values[ i ], thisRowMultiplicator );
      addElementFast( row, column, values[ i ], thisRowMultiplicator );
   }
   return true;
}
@@ -291,16 +370,16 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addRow( const IndexType row,
            cerr << " elements = " << elements
                 << " this->columns = " << this->columns
                 << " this->getName() = " << this->getName() );
   /*if( elements > 3 )
   if( elements > 3 )
      return false;
   for( IndexType i = 0; i < elements; i++ )
   {
      const IndexType& column = columns[ i ];
      const IndexType column = columns[ i ];
      if( column < row - 1 || column > row + 1 )
         return false;
      addElement( row, column, values[ i ], thisRowMultiplicator );
   }
   return true;*/
   return true;
}

template< typename Real,
@@ -314,7 +393,7 @@ Real tnlTridiagonalMatrix< Real, Device, Index >::getElementFast( const IndexTyp
{
   if( abs( column - row ) > 1 )
      return 0.0;
   return this->values.getElement( this->getElementIndex( row, column ) );
   return this->values[ this->getElementIndex( row, column ) ];
}

template< typename Real,
@@ -323,10 +402,11 @@ template< typename Real,
Real tnlTridiagonalMatrix< Real, Device, Index >::getElement( const IndexType row,
                                                              const IndexType column ) const
{
   return this->getElementFast( row, column );
   if( abs( column - row ) > 1 )
      return 0.0;
   return this->values.getElement( this->getElementIndex( row, column ) );
}


template< typename Real,
          typename Device,
          typename Index >
@@ -357,56 +437,50 @@ void tnlTridiagonalMatrix< Real, Device, Index >::getRow( const IndexType row,
                                                          IndexType* columns,
                                                          RealType* values ) const
{
   /*IndexType elementPointer( 0 );
   IndexType elementPointer( 0 );
   for( IndexType i = -1; i <= 1; i++ )
   {
      const IndexType column = row + 1;
      if( column >= 0 && column < this->getColumns() )
      {
         columns[ elementPointer ] = column;
         values[ elementPointer ] = this->values[ this->getElementIndex( row, column ) ];
         values[ elementPointer ] = this->values.getElement( this->getElementIndex( row, column ) );
         elementPointer++;
      }
   }*/
   }


template< typename Real,
          typename Device,
          typename Index >
Real& tnlTridiagonalMatrix< Real, Device, Index >::operator()( const IndexType row,
                                                               const IndexType column )
{
   return this->values[ this->getElementIndex( row, column ) ];
}

template< typename Real,
          typename Device,
          typename Index >
const Real& tnlTridiagonalMatrix< Real, Device, Index >::operator()( const IndexType row,
                                                                     const IndexType column ) const
template< typename Vector >
#ifdef HAVE_CUDA
   __device__ __host__
#endif
typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                         const Vector& vector ) const
{
   return this->values[ this->getElementIndex( row, column ) ];
   return tnlTridiagonalMatrixDeviceDependentCode< Device >::
             rowVectorProduct( this->rows,
                               this->values,
                               row,
                               vector );
}

#ifdef HAVE_CUDA
template< typename Real,
          typename Device,
          typename Index >
template< typename Vector >
typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                         const Vector& vector ) const
          typename Index,
          typename Vector >
__global__ void tnlTridiagonalMatrixVectorProductCudaKernel( tnlTridiagonalMatrix< Real, tnlCuda, Index >* matrix,
                                                             const Vector* inVector,
                                                             Vector* outVector,
                                                             const Index gridIdx )
{
   if( row == 0 )
      return vector[ 0 ] * this->values[ 0 ] +
             vector[ 1 ] * this->values[ 1 ];
   IndexType i = 3 * row - 1;
   if( row == this->rows - 1 )
      return vector[ row - 1 ] * this->values[ i++ ] +
             vector[ row ] * this->values[ i ];
   return vector[ row - 1 ] * this->values[ i++ ] +
          vector[ row ] * this->values[ i++ ] +
          vector[ row + 1 ] * this->values[ i ];
   const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( rowIdx < matrix->getRows() )
      ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
}
#endif

template< typename Real,
          typename Device,
@@ -426,8 +500,32 @@ void tnlTridiagonalMatrix< Real, Device, Index >::vectorProduct( const Vector& i
                    << "Vector size: " << outVector.getSize() << endl
                    << "Vector name: " << outVector.getName() << endl );

   if( Device::getDevice() == tnlHostDevice )
      for( IndexType row = 0; row < this->getRows(); row++ )
         outVector[ row ] = this->rowVectorProduct( row, inVector );

   if( Device::getDevice() == tnlCudaDevice )
   {
#ifdef HAVE_CUDA
      ThisType* kernel_this = tnlCuda::passToDevice( *this );
      Vector* kernel_inVector = tnlCuda::passToDevice( inVector );
      Vector* kernel_outVector = tnlCuda::passToDevice( outVector );
      dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
      const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x );
      const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
      {
         if( gridIdx == cudaGrids - 1 )
            cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
         tnlTridiagonalMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>>
                                                    ( kernel_this, kernel_inVector, kernel_outVector, gridIdx );
      }
      tnlCuda::freeFromDevice( kernel_this );
      tnlCuda::freeFromDevice( kernel_inVector );
      tnlCuda::freeFromDevice( kernel_outVector );
      checkCudaDevice;
#endif
   }
}

template< typename Real,
@@ -443,31 +541,52 @@ void tnlTridiagonalMatrix< Real, Device, Index >::addMatrix( const tnlTridiagona
                 << "This matrix rows: " << this->getRows() << endl
                 << "This matrix name: " << this->getName() << endl );

   const IndexType lastI = this->values.getSize();
   if( thisMatrixMultiplicator == 1.0 )
   {
      for( IndexType i = 0; i < lastI; i++ )
         this->values[ i ] += matrixMultiplicator*matrix.values[ i ];
   }
      this->values.alphaXPlusY( matrixMultiplicator, matrix.values );
   else
   {
      for( IndexType i = 0; i < lastI; i++ )
         this->values[ i ] = thisMatrixMultiplicator * this->values[ i ] +
            matrixMultiplicator*matrix.values[ i ];
      this->values.alphaXPlusBetaY( matrixMultiplicator, matrix.values, thisMatrixMultiplicator );
}

#ifdef HAVE_CUDA
template< typename Real,
          typename Real2,
          typename Index,
          typename Index2 >          
__global__ void tnlTridiagonalMatrixTranspositionCudaKernel( const tnlTridiagonalMatrix< Real2, tnlCuda, Index2 >* inMatrix,
                                                             tnlTridiagonalMatrix< Real, tnlCuda, Index >* outMatrix,
                                                             const Real matrixMultiplicator,
                                                             const Index gridIdx )
{
   const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( rowIdx < inMatrix->getRows() )
   {
      if( rowIdx > 0 )
        outMatrix->setElementFast( rowIdx-1,
                                   rowIdx,
                                   matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx-1 ) );
      outMatrix->setElementFast( rowIdx,
                                 rowIdx,
                                 matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx ) );
      if( rowIdx < inMatrix->getRows()-1 )
         outMatrix->setElementFast( rowIdx+1,
                                    rowIdx,
                                    matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx+1 ) );
   }
}
#endif

template< typename Real,
          typename Device,
          typename Index >
   template< typename Real2, typename Index2, int tileDim >
   template< typename Real2, typename Index2 >
void tnlTridiagonalMatrix< Real, Device, Index >::getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix,
                                                                    const RealType& matrixMultiplicator )
{
   tnlAssert( this->getRows() == matrix.getRows(),
               cerr << "This matrix rows: " << this->getRows() << endl
                    << "That matrix rows: " << matrix.getRows() << endl );

   if( Device::getDevice() == tnlHostDevice )
   {
      const IndexType& rows = matrix.getRows();
      for( IndexType i = 1; i < rows; i++ )
      {
@@ -477,6 +596,31 @@ void tnlTridiagonalMatrix< Real, Device, Index >::getTransposition( const tnlTri
         this->setElement( i - 1, i, aux );
      }
   }
   if( Device::getDevice() == tnlCudaDevice )
   {
#ifdef HAVE_CUDA
      ThisType* kernel_this = tnlCuda::passToDevice( *this );
      typedef  tnlTridiagonalMatrix< Real2, Device, Index2 > InMatrixType;
      InMatrixType* kernel_inMatrix = tnlCuda::passToDevice( matrix );
      dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
      const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
      const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
      {
         if( gridIdx == cudaGrids - 1 )
            cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
         tnlTridiagonalMatrixTranspositionCudaKernel<<< cudaGridSize, cudaBlockSize >>>
                                                    ( kernel_inMatrix,
                                                      kernel_this,
                                                      matrixMultiplicator,
                                                      gridIdx );
      }
      tnlCuda::freeFromDevice( kernel_this );
      tnlCuda::freeFromDevice( kernel_inMatrix );
      checkCudaDevice;
#endif
   }
}

template< typename Real,
          typename Device,
@@ -547,7 +691,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::print( ostream& str ) const
      str <<"Row: " << row << " -> ";
      for( IndexType column = row - 1; column < row + 2; column++ )
         if( column >= 0 && column < this->columns )
            str << " Col:" << column << "->" << this->operator()( row, column ) << "\t";
            str << " Col:" << column << "->" << this->getElement( row, column ) << "\t";
      str << endl;
   }
}
@@ -555,15 +699,21 @@ void tnlTridiagonalMatrix< Real, Device, Index >::print( ostream& str ) const
template< typename Real,
          typename Device,
          typename Index >
#ifdef HAVE_CUDA
   __device__ __host__
#endif
Index tnlTridiagonalMatrix< Real, Device, Index >::getElementIndex( const IndexType row,
                                                                    const IndexType column ) const
{
   // TODO: remove the #ifndef when CUDA supports std:cerr
#ifndef HAVE_CUDA   
   tnlAssert( row >= 0 && column >= 0 && row < this->rows && column < this->rows,
              cerr << " this->rows = " << this->rows
                   << " row = " << row << " column = " << column );
   tnlAssert( abs( row - column ) < 2,
              cerr << "row = " << row << " column = " << column << endl );
   return 3*row + column - row;
#endif   
   return tnlTridiagonalMatrixDeviceDependentCode< Device >::getElementIndex( this->rows, row, column );
}


+28 −7
Original line number Diff line number Diff line
@@ -32,6 +32,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
   typedef Device DeviceType;
   typedef Index IndexType;
   typedef typename tnlMatrix< Real, Device, Index >::RowLengthsVector RowLengthsVector;
   typedef tnlTridiagonalMatrix< Real, Device, Index > ThisType;

   tnlTridiagonalMatrix();

@@ -82,6 +83,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                        const RealType& value,
                        const RealType& thisElementMultiplicator = 1.0 );

   bool addElement( const IndexType row,
                    const IndexType column,
                    const RealType& value,
                    const RealType& thisElementMultiplicator = 1.0 );

#ifdef HAVE_CUDA
   __device__ __host__
#endif
@@ -90,6 +96,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                    const RealType* values,
                    const IndexType elements );

   bool setRow( const IndexType row,
                const IndexType* columns,
                const RealType* values,
                const IndexType elements );

#ifdef HAVE_CUDA
   __device__ __host__
#endif
@@ -99,6 +110,12 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                    const IndexType elements,
                    const RealType& thisRowMultiplicator = 1.0 );

   bool addRow( const IndexType row,
                const IndexType* columns,
                const RealType* values,
                const IndexType elements,
                const RealType& thisRowMultiplicator = 1.0 );

#ifdef HAVE_CUDA
   __device__ __host__
#endif
@@ -115,13 +132,14 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                    IndexType* columns,
                    RealType* values ) const;

   RealType& operator()( const IndexType row,
                         const IndexType column );

   const RealType& operator()( const IndexType row,
                               const IndexType column ) const;
   void getRow( const IndexType row,
                IndexType* columns,
                RealType* values ) const;

   template< typename Vector >
#ifdef HAVE_CUDA
   __device__ __host__
#endif
   typename Vector::RealType rowVectorProduct( const IndexType row,
                                               const Vector& vector ) const;

@@ -135,11 +153,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                   const RealType& thisMatrixMultiplicator = 1.0 );

#ifdef HAVE_NOT_CXX11
   template< typename Real2, typename Index2, int tileDim >
   template< typename Real2, typename Index2 >
   void getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix,
                          const RealType& matrixMultiplicator = 1.0 );
#else
   template< typename Real2, typename Index2, int tileDim = 32 >
   template< typename Real2, typename Index2 >
   void getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix,
                          const RealType& matrixMultiplicator = 1.0 );
#endif   
@@ -162,6 +180,9 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >

   protected:

#ifdef HAVE_CUDA
   __device__ __host__
#endif
   IndexType getElementIndex( const IndexType row,
                              const IndexType column ) const;

Loading