Commit d5a890c8 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing the CUDA support for the dense matrix format.

parent 3a2e4392
Loading
Loading
Loading
Loading
+9 −11
Original line number Diff line number Diff line
@@ -33,15 +33,17 @@ class tnlCuda
#ifdef HAVE_CUDA
   __host__ __device__
#endif
   static inline tnlDeviceEnum getDevice() { return tnlCudaDevice; };
   static inline tnlDeviceEnum getDevice();

   static int getMaxGridSize();

   static void setMaxGridSize( int newMaxGridSize );

   static int getMaxBlockSize();
#ifdef HAVE_CUDA
   __host__ __device__
#endif
   static inline int getMaxGridSize();

   static void setMaxBlockSize( int newMaxBlockSize );
#ifdef HAVE_CUDA
   __host__ __device__
#endif
   static inline int getMaxBlockSize();

   static int getGPUTransferBufferSize();

@@ -64,10 +66,6 @@ class tnlCuda


   static bool checkDevice( const char* file_name, int line );

   protected:

   static int maxGridSize, maxBlockSize;
};

#define checkCudaDevice tnlCuda::checkDevice( __FILE__, __LINE__ )
+1 −0
Original line number Diff line number Diff line
@@ -376,6 +376,7 @@ ostream& operator << ( ostream& str, const tnlSharedArray< Element, Device, Inde
         str << ", " << v. getElement( i );
   }
   str << " ]";
   return str;
}

//}; // namespace implementation
+0 −24
Original line number Diff line number Diff line
@@ -18,35 +18,11 @@
#include <core/tnlCuda.h>
#include <tnlConfig.h>
 
int tnlCuda :: maxGridSize = maxCudaGridSize;
 
int tnlCuda :: maxBlockSize = maxCudaBlockSize;

tnlString tnlCuda :: getDeviceType()
{
   return tnlString( "tnlCuda" );
}

int tnlCuda :: getMaxGridSize()
{
   return maxGridSize;
}

void tnlCuda :: setMaxGridSize( int newMaxGridSize )
{
   maxGridSize = newMaxGridSize;
}

int tnlCuda :: getMaxBlockSize()
{
   return maxBlockSize;
}

void tnlCuda :: setMaxBlockSize( int newMaxBlockSize )
{
   maxBlockSize = newMaxBlockSize;
}

int tnlCuda::getGPUTransferBufferSize()
{
   return 1 << 20;
+27 −0
Original line number Diff line number Diff line
@@ -20,6 +20,33 @@

#ifdef HAVE_CUDA

#ifdef HAVE_CUDA
__host__ __device__
#endif
inline tnlDeviceEnum tnlCuda::getDevice()
{
   return tnlCudaDevice;
};

#ifdef HAVE_CUDA
__host__ __device__
#endif
inline int tnlCuda::getMaxGridSize()
{
   // TODO: make it preprocessor macro constant defined in tnlConfig
   return 65536;
};

#ifdef HAVE_CUDA
__host__ __device__
#endif
inline int tnlCuda::getMaxBlockSize()
{
   // TODO: make it preprocessor macro constant defined in tnlConfig
   return 1024;
};


template< typename ObjectType >
ObjectType* tnlCuda::passToDevice( const ObjectType& object )
{
+138 −32
Original line number Diff line number Diff line
@@ -117,6 +117,15 @@ void tnlDenseMatrix< Real, Device, Index >::reset()
   this->values.reset();
}

template< typename Real,
          typename Device,
          typename Index >
void tnlDenseMatrix< Real, Device, Index >::setValue( const Real& value )
{
   this->values.setValue( value );
}


template< typename Real,
          typename Device,
          typename Index >
@@ -160,6 +169,7 @@ bool tnlDenseMatrix< Real, Device, Index >::addElementFast( const IndexType row,
   else
      this->values.operator[]( elementIndex ) =
         thisElementMultiplicator * this->values.operator[]( elementIndex ) + value;
   return true;
}

template< typename Real,
@@ -177,6 +187,7 @@ bool tnlDenseMatrix< Real, Device, Index >::addElement( const IndexType row,
   else
      this->values.setElement( elementIndex,
                               thisElementMultiplicator * this->values.getElement( elementIndex ) + value );
   return true;
}


@@ -316,15 +327,33 @@ template< typename Real,
          typename Device,
          typename Index >
   template< typename Vector >
#ifdef HAVE_CUDA
   __device__ __host__
#endif
typename Vector::RealType tnlDenseMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                   const Vector& vector ) const
{
   RealType sum( 0.0 );
   for( IndexType column = 0; column < this->getColumns(); column++ )
      sum += this->getElement( row, column ) * vector.getElement( column );
      sum += this->getElementFast( row, column ) * vector[ column ];
   return sum;
}

#ifdef HAVE_CUDA
template< typename Real,
          typename Index,
          typename Vector >
__global__ void tnlDenseMatrixVectorProductCudaKernel( tnlDenseMatrix< Real, tnlCuda, Index >* matrix,
                                                       const Vector* inVector,
                                                       Vector* outVector,
                                                       const Index gridIdx )
{
   const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( rowIdx < matrix->getRows() )
      ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
}
#endif

template< typename Real,
          typename Device,
          typename Index >
@@ -343,8 +372,31 @@ void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const Vector& inVecto
                    << "Vector size: " << outVector.getSize() << endl
                    << "Vector name: " << outVector.getName() << endl );

   if( Device::getDevice() == tnlHostDevice )
      for( IndexType row = 0; row < this->getRows(); row++ )
         outVector[ row ] = rowVectorProduct( row, inVector );
   if( Device::getDevice() == tnlCudaDevice )
   {
#ifdef HAVE_CUDA
      ThisType* kernel_this = tnlCuda::passToDevice( *this );
      Vector* kernel_inVector = tnlCuda::passToDevice( inVector );
      Vector* kernel_outVector = tnlCuda::passToDevice( outVector );
      dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
      const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x );
      const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
      {
         if( gridIdx == cudaGrids - 1 )
            cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
         tnlDenseMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>>
                                                 ( kernel_this, kernel_inVector, kernel_outVector, gridIdx );
      }
      tnlCuda::freeFromDevice( kernel_this );
      tnlCuda::freeFromDevice( kernel_inVector );
      tnlCuda::freeFromDevice( kernel_outVector );
      checkCudaDevice;
#endif
   }
}

template< typename Real,
@@ -365,20 +417,29 @@ void tnlDenseMatrix< Real, Device, Index >::addMatrix( const Matrix& matrix,
                 << "That matrix name: " << matrix.getName() << endl );

   if( thisMatrixMultiplicator == 1.0 )
   {
      for( IndexType row = 0; row < this->getRows(); row++ )
         for( IndexType column = 0; column < this->getColumns(); column++ )
            this->operator()( row, column ) += matrixMultiplicator*matrix( row, column );
   }
      this->values.alphaXPlusY( matrixMultiplicator, matrix.values );
   else
   {
      for( IndexType row = 0; row < this->getRows(); row++ )
         for( IndexType column = 0; column < this->getColumns(); column++ )
            this->operator()( row, column ) =
                thisMatrixMultiplicator * this->operator()( row, column) +
                   matrixMultiplicator * matrix( row, column );
      this->values.alphaXPlusBetaY( matrixMultiplicator, matrix.values, thisMatrixMultiplicator );
}

#ifdef HAVE_CUDA
template< typename Real,
          typename Index,
          typename Matrix1,
          typename Matrix2,
          int tileDim,
          int tileRowBlockSize >
__global__ void tnlDenseMatrixMatrixProductKernel( tnlDenseMatrix< Real, tnlCuda, Index >* reusltMatrix,
                                                   const Matrix1* matrix1,
                                                   const Matrix2* matrix2,
                                                   const Real matrix1Multiplicator,
                                                   const Real matrix2Multiplicator,
                                                   const Index gridIdx_x,
                                                   const Index gridIdx_y )
{

}
#endif

template< typename Real,
          typename Device,
@@ -402,6 +463,7 @@ void tnlDenseMatrix< Real, Device, Index >::getMatrixProduct( const Matrix1& mat
                 << "Matrix2 rows: " << matrix2.getRows() << endl
                 << "Matrix2 name: " << matrix2.getName() << endl );

   if( Device::getDevice() == tnlHostDevice )
      for( IndexType i = 0; i < this->getRows(); i += tileDim )
         for( IndexType j = 0; j < this->getColumns(); j += tileDim )
         {
@@ -409,7 +471,7 @@ void tnlDenseMatrix< Real, Device, Index >::getMatrixProduct( const Matrix1& mat
            const IndexType tileColumns = Min( tileDim, this->getColumns() - j );
            for( IndexType i1 = i; i1 < i + tileRows; i1++ )
               for( IndexType j1 = j; j1 < j + tileColumns; j1++ )
               this->operator()( i1, j1 ) = 0.0;
                  this->setElementFast( i1, j1, 0.0 );

            for( IndexType k = 0; k < matrix1.getColumns(); k += tileDim )
            {
@@ -417,9 +479,53 @@ void tnlDenseMatrix< Real, Device, Index >::getMatrixProduct( const Matrix1& mat
               for( IndexType i1 = 0; i1 < tileRows; i1++ )
                  for( IndexType j1 = 0; j1 < tileColumns; j1++ )
                     for( IndexType k1 = k; k1 < lastK; k1++ )
                     this->operator()( i + i1, j + j1 ) +=
                        matrix1( i + i1, k1 ) * matrix2( k1, j + j1 );
                        this->addElementFast( i + i1, j + j1,
                            matrix1.getElementFast( i + i1, k1 ) * matrix2.getElementFast( k1, j + j1 ) );
            }
         }
   if( Device::getDevice() == tnlCudaDevice )
   {
#ifdef HAVE_CUDA
      dim3 cudaBlockSize( 0 ), cudaGridSize( 0 );
      const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim );
      const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim );
      cudaBlockSize.x = blockXSize;
      cudaBlockSize.y = blockYSize;
      const IndexType rowGrids = roundUpDivision( rowTiles, tnlCuda::getMaxGridSize() );
      const IndexType columnGrids = roundUpDivision( columnTiles, tnlCuda::getMaxGridSize() );

      for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ )
         for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ )
         {
            cudaGridSize.x = cudaGridSize.y = tnlCuda::getMaxGridSize();
            if( gridIdx_x == columnGrids - 1 )
               cudaGridSize.x = columnTiles % tnlCuda::getMaxGridSize();
            if( gridIdx_y == rowGrids - 1 )
               cudaGridSize.y = rowTiles % tnlCuda::getMaxGridSize();
            ThisType* this_kernel = tnlCuda::passToDevice( *this );
            Matrix1* matrix1_kernel = tnlCuda::passToDevice( matrix1 );
            Matrix2* matrix2_kernel = tnlCuda::passToDevice( matrix2 );
            tnlDenseMatrixMatrixProductKernel< Real,
                                               Index,
                                               Matrix1,
                                               Matrix2,
                                               tileDim,
                                               tileRowBlockSize >
                                           <<< cudaGridSize,
                                               cudaBlockSize,
                                               3*tileDim*tileDim >>>
                                             ( this_kernel,
                                               matrix1_kernel,
                                               matrix2_kernel,
                                               matrix1Multiplicator,
                                               matrix2Multiplicator,
                                               gridIdx_x,
                                               gridIdx_y );
            tnlCuda::freeFromDevice( this_kernel );
            tnlCuda::freeFromDevice( matrix1_kernel );
            tnlCuda::freeFromDevice( matrix2_kernel );
         }
#endif
   }
}

Loading