Commit 08ec37be authored by Matouš Fencl's avatar Matouš Fencl Committed by Tomáš Oberhuber
Browse files

FIM method is now faster than chess method but some random error occurs.

parent 970e6448
Loading
Loading
Loading
Loading
+12 −5
Original line number Diff line number Diff line
@@ -61,6 +61,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
      typedef Index IndexType;
      typedef Functions::MeshFunction< MeshType > MeshFunctionType;
      typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
      typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;

@@ -76,6 +77,11 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
      __cuda_callable__ bool updateCell( volatile Real sArray[18][18],
                                         int thri, int thrj, const Real hx, const Real hy,
                                         const Real velocity = 1.0 );
      void updateBlocks( InterfaceMapType interfaceMap,
                         MeshFunctionType aux,
                         ArrayContainer BlockIterHost, int numThreadsPerBlock );
      
      void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
};

template< typename Real,
@@ -132,14 +138,15 @@ template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 );

template < typename Index >
__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );

/*template < typename Real, typename Device, typename Index >
__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/
template < typename Index >
__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY );

template < typename Real, typename Device, typename Index >
__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
@@ -155,7 +162,7 @@ template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
                                      int *BlockIterDevice );
                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
#endif

#include "tnlDirectEikonalMethodsBase_impl.h"
+193 −0
Original line number Diff line number Diff line
@@ -89,6 +89,199 @@ initInterface( const MeshFunctionPointer& _input,
    }
}

template< typename Real,
          typename Device,
          typename Index >
void 
tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
updateBlocks( InterfaceMapType interfaceMap,
              MeshFunctionType aux,
              ArrayContainer BlockIterHost, int numThreadsPerBlock )
{
  for( int i = 0; i < BlockIterHost.getSize(); i++ )
  {
    if( BlockIterHost[ i ] )
    {
      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
    
      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
      int numOfBlockx = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
      int numOfBlocky = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
      int xkolik = numThreadsPerBlock + 1;
      int ykolik = numThreadsPerBlock + 1;
      
      int blIdx = i%numOfBlockx;
      int blIdy = i/numOfBlocky;
      
      if( numOfBlockx - 1 == blIdx )
        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
      
      if( numOfBlocky -1 == blIdy )
        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
    
        
      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
      changed[ 0 ] = 1;*/
      Real hx = mesh.getSpaceSteps().x();
      Real hy = mesh.getSpaceSteps().y();
      
      Real changed1[ 16*16 ];
      /*Real changed2[ 16*16 ];
      Real changed3[ 16*16 ];
      Real changed4[ 16*16 ];*/
      Real sArray[18][18];
      
      for( int thri = 0; thri < numThreadsPerBlock + 2; thri++ )
        for( int thrj = 0; thrj < numThreadsPerBlock + 2; thrj++ )
          sArray[thrj][thri] = std::numeric_limits< Real >::max();
    
      BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
    
      for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
      {        
        if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
          sArray[thrj+1][xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
        else
         sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
      
    
        if( blIdx != 0 && thrj+1 < ykolik )
          sArray[thrj+1][0] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
        else
          sArray[thrj+1][0] = std::numeric_limits< Real >::max();
    
        if( dimY > (blIdy+1) * numThreadsPerBlock  && thrj+1 < xkolik )
          sArray[ykolik][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
        else
          sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
      
        if( blIdy != 0 && thrj+1 < xkolik )
          sArray[0][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
        else
          sArray[0][thrj+1] = std::numeric_limits< Real >::max();
      }
    
      for( int k = 0; k < numThreadsPerBlock; k++ )
        for( int l = 0; l < numThreadsPerBlock; l++ ) 
          sArray[k+1][l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];

      for( int k = 0; k < numThreadsPerBlock; k++ ) 
        for( int l = 0; l < numThreadsPerBlock; l++ ){
          changed1[ k*numThreadsPerBlock + l ] = 0;
          /*changed2[ k*numThreadsPerBlock + l ] = 0;
          changed3[ k*numThreadsPerBlock + l ] = 0;
          changed4[ k*numThreadsPerBlock + l ] = 0;*/
          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
          {
            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
            {
              changed1[ k*numThreadsPerBlock + l ] = this->updateCell( sArray, l+1, k+1, hx,hy);
            }
          }
        }

      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
        for( int l = 0; l < numThreadsPerBlock; l++ ) { 
          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
          {
            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
            {
              /*changed2[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
            }
          }
        }

      for( int k = 0; k < numThreadsPerBlock; k++ ) 
        for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
          {
            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
            {
              /*changed3[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
            }
          }
        }

      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
        for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
          {
            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
            {
              /*changed4[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
            }
          }
        }

      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
        for( int l = numThreadsPerBlock-1; l >-1; l-- ){
          changed1[ 0 ] = changed1[ 0 ] || changed1[ k*numThreadsPerBlock + l ];
          /*changed2[ 0 ] = changed2[ 0 ] || changed2[ k*numThreadsPerBlock + l ];
          changed3[ 0 ] = changed3[ 0 ] || changed3[ k*numThreadsPerBlock + l ];
          changed4[ 0 ] = changed4[ 0 ] || changed4[ k*numThreadsPerBlock + l ];*/
        }
      
      if( changed1[ 0 ] /*|| changed2[ 0 ] ||changed3[ 0 ] ||changed4[ 0 ]*/ )
        BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1;

      for( int k = 0; k < numThreadsPerBlock; k++ ){ 
        for( int l = 0; l < numThreadsPerBlock; l++ ) {       
          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY &&
              (!interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ]) )
            aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray[ k + 1 ][ l + 1 ];
          //std::cout<< sArray[k+1][l+1];
        }
        //std::cout<<std::endl;
      }
    }
  }
}

template< typename Real,
          typename Device,
          typename Index >
void 
tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
{
  int* BlockIterPom; 
  BlockIterPom = new int [numBlockX * numBlockY];
  
  for(int i = 0; i < numBlockX * numBlockY; i++)
  {
    BlockIterPom[ i ] = 0;  
    if( BlockIterHost[ i ] )
    {
      // i = k*numBlockY + m;
      int m=0, k=0;
      m = i%numBlockX;
      k = i/numBlockX;
      if( k > 0 )
        BlockIterPom[i - numBlockX] = 1;
      if( k < numBlockY - 1 )
        BlockIterPom[i + numBlockX] = 1;
      
      if( m < numBlockX - 1 )
        BlockIterPom[ i+1 ] = 1;
      if( m > 0 )
        BlockIterPom[ i-1 ] = 1;
    }
  }
  for(int i = 0; i < numBlockX * numBlockY; i++ )
      //if( !BlockIter[ i ] )
        BlockIterHost[ i ] = BlockIterPom[ i ];
      /*else
        BlockIter[ i ] = 0;*/
  /*for( int i = numBlockX-1; i > -1; i-- )
  {
      for( int j = 0; j< numBlockY; j++ )
          std::cout << BlockIterHost[ i*numBlockY + j ];
      std::cout << std::endl;
  }
  std::cout << std::endl;*/
  delete[] BlockIterPom;
}

template< typename Real,
          typename Device,
          typename Index >
+2 −1
Original line number Diff line number Diff line
@@ -89,6 +89,7 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
      using typename BaseType::MeshFunctionType;
      using typename BaseType::InterfaceMapPointer;
      using typename BaseType::MeshFunctionPointer;
      using typename BaseType::ArrayContainer;

      FastSweepingMethod();
      
+398 −473

File changed.

Preview size limit exceeded, changes collapsed.

+24 −9
Original line number Diff line number Diff line
@@ -258,13 +258,21 @@ solve( const MeshPointer& mesh,
                 
          tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
          
          int *BlockIterDevice;
          
          int BlockIterD = 1;
          
          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );
          TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
          BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
          BlockIterDevice.setValue( 1 );
          /*int *BlockIterDevice;
          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
          int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
          int *dBlock;
          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
          
          TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
          dBlock.setSize( nBlocks );
          dBlock.setValue( 0 );
          /*int *dBlock;
          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
          
          while( BlockIterD )
          {
@@ -272,17 +280,24 @@ solve( const MeshPointer& mesh,
                                                              interfaceMapPtr.template getData< Device >(),
                                                              auxPtr.template modifyData< Device>(),
                                                              BlockIterDevice );
            //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
            //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            
            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            
            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                   
            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
            
          }
          cudaFree( BlockIterDevice );
          cudaFree( dBlock );
          //cudaFree( BlockIterDevice );
          //cudaFree( dBlock );
          cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
          aux = *auxPtr;
@@ -302,7 +317,7 @@ template < typename Real, typename Device, typename Index >
__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
                                      int *BlockIterDevice )
                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
{
    int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
    int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;