Commit 47f7ff04 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Fixing simple heat equation benchmark for CUDA.

parent 5ef32f45
Loading
Loading
Loading
Loading
+7 −7
Original line number Diff line number Diff line
@@ -271,16 +271,16 @@ __global__ void tnlTraverserGrid2DCells( const tnlGrid< 2, Real, tnlCuda, Index
   const int CellDimensions = GridType::Dimensions;
   typename GridType::template GridEntity< CellDimensions > entity( *grid );
   typedef typename GridType::CoordinatesType CoordinatesType;
   CoordinatesType& coordinates = entity.getCoordinates();
   //CoordinatesType& coordinates = entity.getCoordinates();

   const Index& xSize = grid->getDimensions().x();
   const Index& ySize = grid->getDimensions().y();
   /*const Index& xSize = grid->getDimensions().x();
   const Index& ySize = grid->getDimensions().y();*/

   coordinates.x() = ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   coordinates.y() = ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;  
   entity.getCoordinates().x() = ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   entity.getCoordinates().y() = ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;  

   if( coordinates.x() < grid->getDimensions().x() &&
       coordinates.y() < grid->getDimensions().y() )
   if( entity.getCoordinates().x() < grid->getDimensions().x() &&
       entity.getCoordinates().y() < grid->getDimensions().y() )
   {
      entity.setIndex( grid->getEntityIndex( entity ) );
      if( processAllEntities || entity.isBoundaryEntity() == processBoundaryEntities )
+0 −1
Original line number Diff line number Diff line
@@ -158,7 +158,6 @@ bool tnlEulerSolver< Problem > :: solve( DofVectorType& u )
         currentTau = Min( currentTau, this->getMaxTau() );
      }
   }
   return false;
};

template< typename Problem >
+265 −30
Original line number Diff line number Diff line
@@ -28,20 +28,178 @@
using namespace std;

#ifdef HAVE_CUDA
template< typename Real, typename Index >
__device__ void computeBlockResidue( Real* du,
                                     Real* blockResidue,
                                     Index n )
{
   if( n == 128 || n ==  64 || n ==  32 || n ==  16 ||
       n ==   8 || n ==   4 || n ==   2 || n == 256 ||
       n == 512 )
    {
       if( blockDim.x >= 512 )
       {
          if( threadIdx.x < 256 )
             du[ threadIdx.x ] += du[ threadIdx.x  + 256 ];
          __syncthreads();
       }
       if( blockDim.x >= 256 )
       {
          if( threadIdx.x < 128 )
             du[ threadIdx.x ] += du[ threadIdx.x  + 128 ];
          __syncthreads();
       }
       if( blockDim.x >= 128 )
       {
          if( threadIdx.x < 64 )
             du[ threadIdx.x ] += du[ threadIdx.x  + 64 ];
          __syncthreads();
       }

       /***
        * This runs in one warp so it is synchronized implicitly.
        */
       if ( threadIdx.x < 32)
       {
          if( blockDim.x >= 64 )
             du[ threadIdx.x ] += du[ threadIdx.x  + 32 ];
          if( blockDim.x >= 32 )
             du[ threadIdx.x ] += du[ threadIdx.x  + 16 ];
          if( blockDim.x >= 16 )
             du[ threadIdx.x ] += du[ threadIdx.x  + 8 ];
          if( blockDim.x >=  8 )
             du[ threadIdx.x ] += du[ threadIdx.x  + 4 ];
          if( blockDim.x >=  4 )
             du[ threadIdx.x ] += du[ threadIdx.x  + 2 ];
          if( blockDim.x >=  2 )
             du[ threadIdx.x ] += du[ threadIdx.x  + 1 ];
       }
    }
    else
    {
       int s;
       if( n >= 512 )
       {
          s = n / 2;
          if( threadIdx.x < s )
             du[ threadIdx.x ] += du[ threadIdx.x + s ];

          __syncthreads();
          if( 2 * s < n  && threadIdx.x == n - 1 )
             du[ 0 ] += du[ threadIdx.x ];
          n = s;
          __syncthreads();
       }
       if( n >= 256 )
       {
          s = n / 2;
          if( threadIdx.x < s )
             du[ threadIdx.x ] += du[ threadIdx.x + s ];

          __syncthreads();
          if( 2 * s < n  && threadIdx.x == n - 1 )
             du[ 0 ] += du[ threadIdx.x ];
          n = s;
          __syncthreads();
       }
       if( n >= 128 )
       {
          s = n / 2;
          if( threadIdx.x < s )
             du[ threadIdx.x ] += du[ threadIdx.x + s ];

          __syncthreads();
          if( 2 * s < n  && threadIdx.x == n - 1 )
             du[ 0 ] += du[ threadIdx.x ];
          n = s;
          __syncthreads();
       }
       if( n >= 64 )
       {
          s = n / 2;
          if( threadIdx.x < s )
             du[ threadIdx.x ] += du[ threadIdx.x + s ];

          __syncthreads();
          if( 2 * s < n  && threadIdx.x == n - 1 )
             du[ 0 ] += du[ threadIdx.x ];
          n = s;
          __syncthreads();

       }
       if( n >= 32 )
       {
          s = n / 2;
          if( threadIdx.x < s )
             du[ threadIdx.x ] += du[ threadIdx.x + s ];

          __syncthreads();
          if( 2 * s < n  && threadIdx.x == n - 1 )
             du[ 0 ] += du[ threadIdx.x ];
          n = s;
          __syncthreads();
       }
       /***
        * This runs in one warp so it is synchronised implicitly.
        */
       if( n >= 16 )
       {
          s = n / 2;
          if( threadIdx.x < s )
             du[ threadIdx.x ] += du[ threadIdx.x + s ];
          if( 2 * s < n  && threadIdx.x == n - 1 )
             du[ 0 ] += du[ threadIdx.x ];
          n = s;
       }
       if( n >= 8 )
       {
          s = n / 2;
          if( threadIdx.x < s )
             du[ threadIdx.x ] += du[ threadIdx.x + s ];
          if( 2 * s < n  && threadIdx.x == n - 1 )
             du[ 0 ] += du[ threadIdx.x ];
          n = s;
       }
       if( n >= 4 )
       {
          s = n / 2;
          if( threadIdx.x < s )
             du[ threadIdx.x ] += du[ threadIdx.x + s ];
          if( 2 * s < n  && threadIdx.x == n - 1 )
             du[ 0 ] += du[ threadIdx.x ];
          n = s;
       }
       if( n >= 2 )
       {
          s = n / 2;
          if( threadIdx.x < s )
             du[ threadIdx.x ] += du[ threadIdx.x + s ];
          if( 2 * s < n  && threadIdx.x == n - 1 )
             du[ 0 ] += du[ threadIdx.x ];
          n = s;
       }
    }

   if( threadIdx.x == 0 )
      blockResidue[ blockIdx.x ] = du[ 0 ];

}

template< typename Real, typename Index >
__global__ void boundaryConditionsKernel( const Real* u, Real* aux,
                                          const Index gridXSize, const Index gridYSize )
{
   const Index i = ( blockIdx.x ) * blockDim.x + threadIdx.x;
   const Index j = ( blockIdx.y ) * blockDim.y + threadIdx.y;
   if( i == 0 && j < gridYSize )
   /*if( i == 0 && j < gridYSize )
      aux[ j * gridXSize ] = 0.0; //u[ j * gridXSize + 1 ];
   if( i == gridXSize - 1 && j < gridYSize )
   /*if( i == gridXSize - 1 && j < gridYSize )
      aux[ j * gridXSize + gridXSize - 2 ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ];      
   if( j == 0 && i < gridXSize )
      aux[ j * gridXSize ] = 0.0; //u[ j * gridXSize + 1 ];
   if( j == gridYSize -1  && i < gridXSize )
      aux[ j * gridXSize + gridXSize - 2 ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ];      
    */
}


@@ -59,20 +217,58 @@ __global__ void heatEquationKernel( const Real* u,
   if( i > 0 && i < gridXSize - 1 &&
       j > 0 && j < gridYSize - 1 )
   {
      printf( "( %d, %d ) ", i, j );
      const Index c = j * gridXSize + i;
      aux[ c ] = u[ c ] + tau * ( ( u[ c - 1 ] - 2.0 * u[ c ] + u[ c + 1 ] ) * hx_inv +
      aux[ c ] = tau * ( ( u[ c - 1 ] - 2.0 * u[ c ] + u[ c + 1 ] ) * hx_inv +
                       ( u[ c - gridXSize ] - 2.0 * u[ c ] + u[ c + gridXSize ] ) * hy_inv );
   }
}

template< typename Real, typename Index >
__global__ void updateKernel( Real* u,
                              const Real* aux,
                              Real* aux,
                              Real* cudaBlockResidue,
                              const Index dofs )
{
   Index idx = blockIdx.x * blockDim.x + threadIdx.x;
   if( idx < dofs )
      u[ idx ] += aux[ idx ];
   const Index blockOffset = blockIdx.x * blockDim.x;
   Index idx = blockOffset + threadIdx.x;
   //if( idx < dofs )
   if( threadIdx.x == 0 && idx < dofs )
      printf( "%d %d %d -> %d \n", blockIdx.x, blockDim.x, blockOffset, idx );
   //   u[ idx ] += aux[ idx ];
   
   __syncthreads();

   const Index rest = dofs - blockOffset;
   Index n =  rest < blockDim.x ? rest : blockDim.x;

   //computeBlockResidue< Real, Index >( aux,
   //                                    cudaBlockResidue,
   //                                    n );
}

template< typename Real, typename Index >
bool writeFunction(
   char* fileName,
   const Real* data,
   const Index xSize,
   const Index ySize,
   const Real& hx,
   const Real& hy )
{
   fstream file;
   file.open( fileName, ios::out );
   if( ! file )
   {
      cerr << "Unable to open file " << fileName << "." << endl;
      return false;
   }
   for( Index i = 0; i < xSize; i++ )
   {
      for( Index j = 0; j < ySize; j++ )
         file << i * hx << " " << j * hy << " " << data[ j * xSize + i ] << endl;
      file << endl;
   }
}

template< typename Real, typename Index >
@@ -86,18 +282,22 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters )
   Real tau = parameters.getParameter< double >( "time-step" );
   const Real finalTime = parameters.getParameter< double >( "final-time" );
   const bool verbose = parameters.getParameter< bool >( "verbose" );
   const Index dofsCount = gridXSize * gridYSize;
   dim3 cudaUpdateBlocks( dofsCount / 256 + ( dofsCount % 256 != 0 ) );
   dim3 cudaUpdateBlockSize( 256 );
   
   /****
    * Initiation
    */   
   Real* u = new Real[ gridXSize * gridYSize ];
   Real* aux = new Real[ gridXSize * gridYSize ];
   Real* u = new Real[ dofsCount ];
   Real* aux = new Real[ dofsCount ];
   Real* max_du = new Real[ cudaUpdateBlocks.x ];
   if( ! u || ! aux )
   {
      cerr << "I am not able to allocate grid function for grid size " << gridXSize << "x" << gridYSize << "." << endl;
      return false;
   }
   const Index dofsCount = gridXSize * gridYSize;
   
   const Real hx = domainXSize / ( Real ) gridXSize;
   const Real hy = domainYSize / ( Real ) gridYSize;
   const Real hx_inv = 1.0 / ( hx * hx );
@@ -121,15 +321,22 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters )
         const Real y = j * hy - domainYSize / 2.0;      
         u[ j * gridXSize + i ] = exp( - sigma * ( x * x + y * y ) );
      }
   writeFunction( "initial", u, gridXSize, gridYSize, hx, hy );
   
   /****
    * Allocate data on the CUDA device
    */
   
   Real *cuda_u, *cuda_aux;
   cudaMalloc( &cuda_u, gridXSize * gridYSize * sizeof( Real ) );
   cudaMalloc( &cuda_aux, gridXSize * gridYSize * sizeof( Real ) );
   cudaMemcpy( cuda_u, u, gridXSize * gridYSize * sizeof( Real ),  cudaMemcpyHostToDevice );
   int cudaErr;
   Real *cuda_u, *cuda_aux, *cuda_max_du;
   cudaMalloc( &cuda_u, dofsCount * sizeof( Real ) );
   cudaMalloc( &cuda_aux, dofsCount * sizeof( Real ) );
   cudaMemcpy( cuda_u, u, dofsCount * sizeof( Real ),  cudaMemcpyHostToDevice );
   cudaMalloc( &cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ) );
   if( ( cudaErr = cudaGetLastError() ) != cudaSuccess )
   {
      cerr << "Allocation failed. " << cudaErr << endl;
      return false;
   }
   
   /****
    * Explicit Euler solver
@@ -138,7 +345,8 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters )
   dim3 cudaBlockSize( 16, 16 );
   dim3 cudaGridSize( gridXSize / 16 + ( gridXSize % 16 != 0 ),
                      gridYSize / 16 + ( gridYSize % 16 != 0 ) );
   const Index dofs = gridXSize * gridYSize;
   cout << "Setting grid size to " << cudaGridSize.x << "," << cudaGridSize.y << "," << cudaGridSize.z << endl;
   cout << "Setting block size to " << cudaBlockSize.x << "," << cudaBlockSize.y << "," << cudaBlockSize.z << endl;

   if( verbose )
      cout << "Starting the solver main loop..." << endl;   
@@ -155,26 +363,49 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters )
      /****
       * Neumann boundary conditions
       */
      boundaryConditionsKernel<<< cudaGridSize, cudaBlockSize >>>( u, aux, gridXSize, gridYSize );
      //cout << "Setting boundary conditions ... " << endl;
      boundaryConditionsKernel<<< cudaGridSize, cudaBlockSize >>>( cuda_u, cuda_aux, gridXSize, gridYSize );
      if( ( cudaErr = cudaGetLastError() ) != cudaSuccess )
      {
         cerr << "Setting of boundary conditions failed. " << cudaErr << endl;
         return false;
      }
                  
      /****
       * Laplace operator
       */
      heatEquationKernel<<< cudaGridSize, cudaBlockSize >>>
         ( u, aux, tau, hx_inv, hy_inv, gridXSize, gridYSize );
      cout << "Laplace operator ... " << endl;
     //heatEquationKernel<<< cudaGridSize, cudaBlockSize >>>
     //    ( cuda_u, cuda_aux, tau, hx_inv, hy_inv, gridXSize, gridYSize );
      if( cudaGetLastError() != cudaSuccess )
      {
         cerr << "Laplace operator failed." << endl;
         return false;
      }
            
      /****
       * Update
       */            
      updateKernel<<< dofs / 256 + ( dofs % 256 != 0 ), 256 >>>( u, aux, dofs );
      
      /*Real absMax( 0.0 );
      for( Index i = 0; i < dofsCount; i++ )
      cout << "Update ... " << endl;
      updateKernel<<< cudaUpdateBlocks, cudaUpdateBlockSize >>>( u, aux, cuda_max_du, dofsCount );
      if( cudaGetLastError() != cudaSuccess )
      {
         const Real a = fabs( aux[ i ] );
         absMax = a > absMax ? a : absMax;
      }*/
         cerr << "Update failed." << endl;
         return false;
      }
      cudaThreadSynchronize();
      cudaMemcpy( max_du, cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ), cudaMemcpyDeviceToHost );
      if( ( cudaErr = cudaGetLastError() ) != cudaSuccess )
      {
         cerr << "Copying max_du failed. " << cudaErr << endl;
         return false;
      }
      Real absMax( 0.0 );
      for( Index i = 0; i < cudaUpdateBlocks.x; i++ )
      {
         const Real a = fabs( max_du[ i ] );
         absMax = a > absMax ? a : absMax;
      }
            
      time += currentTau;
      iteration++;
@@ -185,6 +416,8 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters )
   if( verbose )      
      cout << endl << "Finished..." << endl;
   cout << "Computation time is " << timer.getTime() << " sec. i.e. " << timer.getTime() / ( double ) iteration << "sec. per iteration." << endl;
   cudaMemcpy( u, cuda_u, dofsCount * sizeof( Real ), cudaMemcpyDeviceToHost );
   writeFunction( "final", u, gridXSize, gridYSize, hx, hy );
   
   /***
    * Freeing allocated memory
@@ -193,8 +426,10 @@ bool solveHeatEquationCuda( const tnlParameterContainer& parameters )
      cout << "Freeing allocated memory..." << endl;
   delete[] u;
   delete[] aux;
   delete[] max_du;
   cudaFree( cuda_u );
   cudaFree( cuda_aux );
   cudaFree( cuda_max_du );
   return true;
}
#endif