Commit 39b4889c authored by Matouš Fencl's avatar Matouš Fencl Committed by Tomáš Oberhuber
Browse files

Last repair of FIM for GPU.

parent a6cfb604
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -148,7 +148,7 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I

template < typename Index >
__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY );
                               TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );

template < typename Real, typename Device, typename Index >
__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
+36 −36
Original line number Diff line number Diff line
@@ -134,6 +134,7 @@ updateBlocks( InterfaceMapType interfaceMap,
      Real hy = mesh.getSpaceSteps().y();
      
      bool changed = false;
      BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
      
      
      Real *sArray;
@@ -143,41 +144,40 @@ updateBlocks( InterfaceMapType interfaceMap,
      
      for( int thri = 0; thri < sizeSArray; thri++ ){
        for( int thrj = 0; thrj < sizeSArray; thrj++ )
          sArray/*[i]*/[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
          sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
      }
      
      BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
      
      for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
      {        
        if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
          sArray/*[i]*/[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
          sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
        
        
        if( blIdx != 0 && thrj+1 < ykolik )
          sArray/*[i]*/[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
          sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
        
        if( dimY > (blIdy+1) * numThreadsPerBlock  && thrj+1 < xkolik )
          sArray/*[i]*/[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
          sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
        
        if( blIdy != 0 && thrj+1 < xkolik )
          sArray/*[i]*/[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
          sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
      }
      
      for( int k = 0; k < numThreadsPerBlock; k++ ){
        for( int l = 0; l < numThreadsPerBlock; l++ )
          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
            sArray/*[i]*/[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
            sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
      }
      bool pom = false;
      
      for( int k = 0; k < numThreadsPerBlock; k++ ){ 
        for( int l = 0; l < numThreadsPerBlock; l++ ){
          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){
            //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
            {
              pom = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
              changed = changed || pom;
              changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed;
              
            }
          }
        }
@@ -254,7 +254,7 @@ updateBlocks( InterfaceMapType interfaceMap,
      for( int k = 0; k < numThreadsPerBlock; k++ ){ 
        for( int l = 0; l < numThreadsPerBlock; l++ ) {
          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )      
            helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray/*[i]*/[ (k + 1)* sizeSArray + l + 1 ];
            helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ];
          //std::cout<< sArray[k+1][l+1];
        }
        //std::cout<<std::endl;
+88 −77
Original line number Diff line number Diff line
@@ -123,6 +123,7 @@ solve( const MeshPointer& mesh,
        helpFunc = helpFunc1;
        this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
        
        //Reduction      
        for( int i = 0; i < BlockIterHost.getSize(); i++ ){
          if( IsCalculationDone == 0 ){
            IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
@@ -130,6 +131,7 @@ solve( const MeshPointer& mesh,
          }
        }
        numWhile++;
        std::cout <<"numWhile = "<< numWhile <<std::endl;
        
        for( int j = numBlocksY-1; j>-1; j-- ){
          for( int i = 0; i < numBlocksX; i++ )
@@ -146,7 +148,6 @@ solve( const MeshPointer& mesh,
         std::cout << std::endl;
         }
         std::cout << std::endl;*/
        //Reduction      
        
        //std::cout<<std::endl;
        string s( "aux-"+ std::to_string(numWhile) + ".tnl");
@@ -261,9 +262,9 @@ solve( const MeshPointer& mesh,
      TNL_CHECK_CUDA_DEVICE;
      
      
      /*TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
      BlockIterPom.setSize( numBlocksX * numBlocksY  );
       BlockIterPom.setValue( 0 );*/
      BlockIterPom.setValue( 0 );
      /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
      BlockIterPom1.setSize( numBlocksX * numBlocksY  );
      BlockIterPom1.setValue( 0 );*/
@@ -284,9 +285,7 @@ solve( const MeshPointer& mesh,
       cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/
      
      
      MeshFunctionPointer helpFunc1;
      helpFunc1->setMesh(mesh);
      
      MeshFunctionPointer helpFunc1( mesh );      
      MeshFunctionPointer helpFunc( mesh );
      
      helpFunc1 = auxPtr;
@@ -343,6 +342,7 @@ solve( const MeshPointer& mesh,
        helpFunc1 = auxPtr;
        auxPtr = helpFunc;
        helpFunc = helpFunc1;
        TNL_CHECK_CUDA_DEVICE;
        
        //int pocBloku = 0;
        Devices::Cuda::synchronizeDevice();
@@ -355,10 +355,20 @@ solve( const MeshPointer& mesh,
        TNL_CHECK_CUDA_DEVICE;
        
        //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl;
        //BlockIterPom1 = BlockIterDevice;
        ///for( int i =0; i< numBlocksX; i++ ){
        //  for( int j = 0; j < numBlocksY; j++ )
        //  {
        //    std::cout << BlockIterPom1[j*numBlocksX + i];
        //  }
        //  std::cout << std::endl;
        //}
        //std::cout << std::endl;
        
         GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, numBlocksX, numBlocksY );
        GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
        BlockIterDevice = BlockIterPom;
        
        //std::cout<< "Probehlo" << std::endl;
        
@@ -392,7 +402,6 @@ solve( const MeshPointer& mesh,
       cudaFree( dBlock );
       delete BlockIter;*/
      cudaDeviceSynchronize();
      
      TNL_CHECK_CUDA_DEVICE;
      
      aux = *auxPtr;
@@ -410,7 +419,7 @@ solve( const MeshPointer& mesh,

template < typename Index >
__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
        /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY )
        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY )
{
  int i = blockIdx.x * 1024 + threadIdx.x;
  
@@ -430,7 +439,7 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index
      pom = 1;//BlockIterPom[ i ] = 1;
    }
    
    BlockIterDevice[ i ] = pom;//BlockIterPom[ i ];
    BlockIterPom[ i ] = pom;//BlockIterPom[ i ];
  }
}

@@ -514,14 +523,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
  int i = threadIdx.x + blockDim.x*blockIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  /** FOR CHESS METHOD */
  if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
  {
    /**-----------------------------------------*/
  //if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
  //{
  /**------------------------------------------*/
  
  
  /** FOR FIM METHOD */
    /*if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x] )
     {*/ 
    
  if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] )
  { 
    __syncthreads();
    /**-----------------------------------------*/
    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
    __shared__ volatile int dimX;