Commit 8329b53f authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Tunnig experiments - new versio of CUDA boundary traverser, simpler and using more CUDA blocks.

parent 84a61937
Loading
Loading
Loading
Loading
+96 −7
Original line number Diff line number Diff line
@@ -410,17 +410,98 @@ GridTraverser2DBoundary(
   const Index endX,
   const Index beginY,
   const Index endY,
   const Index blocksPerFace,
   const dim3 gridIdx,
   const GridEntityParameters... gridEntityParameters )
{
   using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >;
   using CoordinatesType = typename GridType::CoordinatesType;
   
   Index entitiesAlongX = endX - beginX + 1;
   Index entitiesAlongY = endY - beginY;
   const Index faceIdx = blockIdx.x / blocksPerFace;
   const Index faceBlockIdx = blockIdx.x % blocksPerFace;
   const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x;
   if( faceIdx < 2 )
   {
      const Index entitiesAlongX = endX - beginX + 1;
      if( threadId < entitiesAlongX )
      {
         GridEntity entity( *grid, 
            CoordinatesType(  beginX + threadId, faceIdx == 0 ? beginY : endY ),
            gridEntityParameters... );
         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
         entity.refresh();
         EntitiesProcessor::processEntity( *grid, userData, entity );
      }
   }
   else
   {
      const Index entitiesAlongY = endY - beginY - 1;   
      if( threadId < entitiesAlongY )
      {
         GridEntity entity( *grid, 
            CoordinatesType(  faceIdx == 2 ? beginX : endX, beginY + threadId + 1  ),
            gridEntityParameters... );
         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
         entity.refresh();
         EntitiesProcessor::processEntity( *grid, userData, entity );
      }
   }
   
   
   
   /*const Index aux = max( entitiesAlongX, entitiesAlongY );
   const Index& warpSize = Devices::Cuda::getWarpSize();
   const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) );
   
   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
   GridEntity entity( *grid, 
         CoordinatesType( 0, 0 ),
         gridEntityParameters... );
   CoordinatesType& coordinates = entity.getCoordinates();
   const Index axisIndex = threadId / threadsPerAxis;
   //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis );   
   threadId -= axisIndex * threadsPerAxis;
   switch( axisIndex )
   {
      case 1:
         coordinates = CoordinatesType( beginX + threadId, beginY );
         if( threadId < entitiesAlongX )
         {
            //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
            entity.refresh();
            EntitiesProcessor::processEntity( *grid, userData, entity );
         }
         break;
      case 2:
         coordinates = CoordinatesType( beginX + threadId, endY );
         if( threadId < entitiesAlongX )
         {
            //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
            entity.refresh();
            EntitiesProcessor::processEntity( *grid, userData, entity );
         }
         break;
      case 3:
         coordinates = CoordinatesType( beginX, beginY + threadId + 1 );
         if( threadId < entitiesAlongY )
         {
            //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
            entity.refresh();
            EntitiesProcessor::processEntity( *grid, userData, entity );
         }
         break;
      case 4:
         coordinates = CoordinatesType( endX, beginY + threadId + 1 );
         if( threadId < entitiesAlongY )
         {
            //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
            entity.refresh();
            EntitiesProcessor::processEntity( *grid, userData, entity );
         }
         break;
   }*/
   
   /*if( threadId < entitiesAlongX )
   {
      GridEntity entity( *grid, 
         CoordinatesType( beginX + threadId, beginY ),
@@ -457,7 +538,7 @@ GridTraverser2DBoundary(
      entity.refresh();
      //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
      EntitiesProcessor::processEntity( *grid, userData, entity );
   }
   }*/
}
#endif

@@ -487,8 +568,14 @@ processEntities(
   {
      dim3 cudaBlockSize( 256 );      
      dim3 cudaBlocksCount, cudaGridsCount;
      IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 );
      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
      const IndexType entitiesAlongY = end.x() - begin.x() - 1;
      const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY );
      const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 );
      IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace;
      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
      //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount 
      //          << "cudaBlockCount = " << cudaBlocksCount.x << std::endl;      
      dim3 gridIdx, cudaGridSize;
      Devices::Cuda::synchronizeDevice();
      for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
@@ -503,9 +590,11 @@ processEntities(
                 end.x(),
                 begin.y(),
                 end.y(),
                 blocksPerFace,
                 gridIdx,
                 gridEntityParameters... );
      }
      //getchar();
      TNL_CHECK_CUDA_DEVICE;
   }
   else
+4 −4
Original line number Diff line number Diff line
@@ -593,12 +593,12 @@ getExplicitUpdate( const RealType& time,
            TNL::Devices::Cuda::synchronizeDevice();
            int cudaErr;
            Meshes::Traverser< MeshType, Cell > meshTraverser;
            /*meshTraverser.template processInteriorEntities< UserData,
            meshTraverser.template processInteriorEntities< UserData,
                                                      InteriorEntitiesProcessor >
                                                          ( mesh,
                                                            userData );
             // */
            _heatEquationKernel< InteriorEntitiesProcessor, UserData, MeshType, RealType, IndexType >
            /*_heatEquationKernel< InteriorEntitiesProcessor, UserData, MeshType, RealType, IndexType >
            <<< cudaGridSize, cudaBlockSize >>>
               ( &mesh.template getData< Devices::Cuda >(),
                userData );
@@ -609,12 +609,12 @@ getExplicitUpdate( const RealType& time,
               return;
            }
            
            /*meshTraverser.template processBoundaryEntities< UserData,
            meshTraverser.template processBoundaryEntities< UserData,
                                                      BoundaryEntitiesProcessor >
                                                          ( mesh,
                                                            userData );
            // */
           _boundaryConditionsKernel< BoundaryEntitiesProcessor, UserData, MeshType, RealType, IndexType >
           /*_boundaryConditionsKernel< BoundaryEntitiesProcessor, UserData, MeshType, RealType, IndexType >
            <<< cudaGridSize, cudaBlockSize >>>
               ( &mesh.template getData< Devices::Cuda >(),
                userData );