Commit 3e00d6f0 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing the Ellpack format in CUDA.

parent a19246ae
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
@@ -79,6 +79,16 @@ inline int roundUpDivision( const int num, const int div )
{
   return num / div + ( num % div != 0 );
}

#ifdef HAVE_CUDA
__device__ __host__
#endif
inline int roundToMultiple( int number, int multiple )
{
   return multiple*( number/ multiple + ( number % multiple != 0 ) );
}


/*template< typename T >
void swap( T& a, T& b)
{
+9 −0
Original line number Diff line number Diff line
@@ -28,6 +28,8 @@ class tnlCuda
{
   public:

   enum { DeviceType = tnlCudaDevice };

   static tnlString getDeviceType();

#ifdef HAVE_CUDA
@@ -63,6 +65,13 @@ static inline int getWarpSize();
   template< typename ObjectType >
   static ObjectType* passToDevice( const ObjectType& object );

   template< typename ObjectType >
   static ObjectType passFromDevice( const ObjectType& object );

   template< typename ObjectType >
   static void passFromDevice( const ObjectType& deviceObject,
                               ObjectType& hostObject );

   template< typename ObjectType >
   static void freeFromDevice( ObjectType* object );

+2 −0
Original line number Diff line number Diff line
@@ -26,6 +26,8 @@ class tnlHost
{
   public:

   enum { DeviceType = tnlHostDevice };

   static tnlString getDeviceType();

#ifdef HAVE_CUDA
+23 −1
Original line number Diff line number Diff line
@@ -64,7 +64,6 @@ inline int tnlCuda::getNumberOfSharedMemoryBanks()
   return 32;
}


template< typename ObjectType >
ObjectType* tnlCuda::passToDevice( const ObjectType& object )
{
@@ -87,6 +86,29 @@ ObjectType* tnlCuda::passToDevice( const ObjectType& object )
   return deviceObject;
}

template< typename ObjectType >
ObjectType tnlCuda::passFromDevice( const ObjectType& object )
{
   ObjectType aux;
   cudaMemcpy( ( void* ) &aux,
               ( void* ) &object,
               sizeof( ObjectType ),
               cudaMemcpyDeviceToHost );
   checkCudaDevice;
   return aux;
}

template< typename ObjectType >
void tnlCuda::passFromDevice( const ObjectType& deviceObject,
                              ObjectType& hostObject )
{
   cudaMemcpy( ( void* ) &hostObject,
               ( void* ) &deviceObject,
               sizeof( ObjectType ),
               cudaMemcpyDeviceToHost );
   checkCudaDevice;
}

template< typename ObjectType >
void tnlCuda::freeFromDevice( ObjectType* deviceObject )
{
+3 −23
Original line number Diff line number Diff line
@@ -352,7 +352,7 @@ typename Vector::RealType tnlDenseMatrix< Real, Device, Index >::rowVectorProduc
   return sum;
}

#ifdef HAVE_CUDA
/*#ifdef HAVE_CUDA
template< typename Real,
          typename Index,
          typename Vector >
@@ -365,7 +365,7 @@ __global__ void tnlDenseMatrixVectorProductCudaKernel( tnlDenseMatrix< Real, tnl
   if( rowIdx < matrix->getRows() )
      ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
}
#endif
#endif*/

template< typename Real,
          typename Device,
@@ -389,27 +389,7 @@ void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const Vector& inVecto
      for( IndexType row = 0; row < this->getRows(); row++ )
         outVector[ row ] = rowVectorProduct( row, inVector );
   if( Device::getDevice() == tnlCudaDevice )
   {
#ifdef HAVE_CUDA
      ThisType* kernel_this = tnlCuda::passToDevice( *this );
      Vector* kernel_inVector = tnlCuda::passToDevice( inVector );
      Vector* kernel_outVector = tnlCuda::passToDevice( outVector );
      dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
      const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x );
      const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
      {
         if( gridIdx == cudaGrids - 1 )
            cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
         tnlDenseMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>>
                                                 ( kernel_this, kernel_inVector, kernel_outVector, gridIdx );
      }
      tnlCuda::freeFromDevice( kernel_this );
      tnlCuda::freeFromDevice( kernel_inVector );
      tnlCuda::freeFromDevice( kernel_outVector );
      checkCudaDevice;
#endif
   }
      tnlMatrixVectorProductCuda( *this, inVector, outVector );
}

template< typename Real,
Loading