FIM method is now faster than chess method but some random error occurs.

444e01c4 · Matouš Fencl · f5c32276 · 444e01c4 · 444e01c4 · 444e01c4
Commit 444e01c4 authored 6 years ago by Matouš Fencl
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -61,8 +61,9 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
      typedef Index IndexType;
      typedef Functions::MeshFunction< MeshType > MeshFunctionType;
      typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
+      typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
-      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
+      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
      void initInterface( const MeshFunctionPointer& input,
                          MeshFunctionPointer& output,
@@ -76,6 +77,11 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
      __cuda_callable__ bool updateCell( volatile Real sArray[18][18],
                                         int thri, int thrj, const Real hx, const Real hy,
                                         const Real velocity = 1.0 );
+      void updateBlocks( InterfaceMapType interfaceMap,
+                         MeshFunctionType aux,
+                         ArrayContainer BlockIterHost, int numThreadsPerBlock );
+      void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
 };
 template< typename Real,
@@ -113,6 +119,8 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double
 template < typename T1 >
 __cuda_callable__ void sortMinims( T1 pom[] );
+template < typename Index >
+void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY );
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
@@ -130,11 +138,15 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice, int oddEvenBlock);
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 );
-__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks );
+template < typename Index >
+__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );
-/*template < typename Real, typename Device, typename Index >
+template < typename Index >
-__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/
+__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY );
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
@@ -150,7 +162,7 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice );
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
 #endif
 #include "tnlDirectEikonalMethodsBase_impl.h"
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -89,6 +89,199 @@ initInterface( const MeshFunctionPointer& _input,
    }
 }
+template< typename Real,
+          typename Device,
+          typename Index >
+void 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateBlocks( InterfaceMapType interfaceMap,
+              MeshFunctionType aux,
+              ArrayContainer BlockIterHost, int numThreadsPerBlock )
+{
+  for( int i = 0; i < BlockIterHost.getSize(); i++ )
+  {
+    if( BlockIterHost[ i ] )
+    {
+      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
+      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+      int numOfBlockx = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlocky = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+      int xkolik = numThreadsPerBlock + 1;
+      int ykolik = numThreadsPerBlock + 1;
+      int blIdx = i%numOfBlockx;
+      int blIdy = i/numOfBlocky;
+      if( numOfBlockx - 1 == blIdx )
+        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
+      if( numOfBlocky -1 == blIdy )
+        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
+      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
+      changed[ 0 ] = 1;*/
+      Real hx = mesh.getSpaceSteps().x();
+      Real hy = mesh.getSpaceSteps().y();
+      Real changed1[ 16*16 ];
+      /*Real changed2[ 16*16 ];
+      Real changed3[ 16*16 ];
+      Real changed4[ 16*16 ];*/
+      Real sArray[18][18];
+      for( int thri = 0; thri < numThreadsPerBlock + 2; thri++ )
+        for( int thrj = 0; thrj < numThreadsPerBlock + 2; thrj++ )
+          sArray[thrj][thri] = std::numeric_limits< Real >::max();
+      BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
+      for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
+      {        
+        if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
+          sArray[thrj+1][xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
+        else
+         sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
+        if( blIdx != 0 && thrj+1 < ykolik )
+          sArray[thrj+1][0] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
+        else
+          sArray[thrj+1][0] = std::numeric_limits< Real >::max();
+        if( dimY > (blIdy+1) * numThreadsPerBlock  && thrj+1 < xkolik )
+          sArray[ykolik][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
+        else
+          sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
+        if( blIdy != 0 && thrj+1 < xkolik )
+          sArray[0][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
+        else
+          sArray[0][thrj+1] = std::numeric_limits< Real >::max();
+      }
+      for( int k = 0; k < numThreadsPerBlock; k++ )
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          sArray[k+1][l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
+      for( int k = 0; k < numThreadsPerBlock; k++ ) 
+        for( int l = 0; l < numThreadsPerBlock; l++ ){
+          changed1[ k*numThreadsPerBlock + l ] = 0;
+          /*changed2[ k*numThreadsPerBlock + l ] = 0;
+          changed3[ k*numThreadsPerBlock + l ] = 0;
+          changed4[ k*numThreadsPerBlock + l ] = 0;*/
+          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
+          {
+            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
+            {
+              changed1[ k*numThreadsPerBlock + l ] = this->updateCell( sArray, l+1, k+1, hx,hy);
+            }
+          }
+        }
+      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
+        for( int l = 0; l < numThreadsPerBlock; l++ ) { 
+          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
+          {
+            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
+            {
+              /*changed2[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
+            }
+          }
+        }
+      for( int k = 0; k < numThreadsPerBlock; k++ ) 
+        for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
+          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
+          {
+            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
+            {
+              /*changed3[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
+            }
+          }
+        }
+      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
+        for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
+          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
+          {
+            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
+            {
+              /*changed4[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
+            }
+          }
+        }
+      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
+        for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+          changed1[ 0 ] = changed1[ 0 ] || changed1[ k*numThreadsPerBlock + l ];
+          /*changed2[ 0 ] = changed2[ 0 ] || changed2[ k*numThreadsPerBlock + l ];
+          changed3[ 0 ] = changed3[ 0 ] || changed3[ k*numThreadsPerBlock + l ];
+          changed4[ 0 ] = changed4[ 0 ] || changed4[ k*numThreadsPerBlock + l ];*/
+        }
+      if( changed1[ 0 ] /*|| changed2[ 0 ] ||changed3[ 0 ] ||changed4[ 0 ]*/ )
+        BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1;
+      for( int k = 0; k < numThreadsPerBlock; k++ ){ 
+        for( int l = 0; l < numThreadsPerBlock; l++ ) {       
+          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY &&
+              (!interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ]) )
+            aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray[ k + 1 ][ l + 1 ];
+          //std::cout<< sArray[k+1][l+1];
+        }
+        //std::cout<<std::endl;
+      }
+    }
+  }
+}
+template< typename Real,
+          typename Device,
+          typename Index >
+void 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
+{
+  int* BlockIterPom; 
+  BlockIterPom = new int [numBlockX * numBlockY];
+  for(int i = 0; i < numBlockX * numBlockY; i++)
+  {
+    BlockIterPom[ i ] = 0;  
+    if( BlockIterHost[ i ] )
+    {
+      // i = k*numBlockY + m;
+      int m=0, k=0;
+      m = i%numBlockX;
+      k = i/numBlockX;
+      if( k > 0 )
+        BlockIterPom[i - numBlockX] = 1;
+      if( k < numBlockY - 1 )
+        BlockIterPom[i + numBlockX] = 1;
+      if( m < numBlockX - 1 )
+        BlockIterPom[ i+1 ] = 1;
+      if( m > 0 )
+        BlockIterPom[ i-1 ] = 1;
+    }
+  }
+  for(int i = 0; i < numBlockX * numBlockY; i++ )
+      //if( !BlockIter[ i ] )
+        BlockIterHost[ i ] = BlockIterPom[ i ];
+      /*else
+        BlockIter[ i ] = 0;*/
+  /*for( int i = numBlockX-1; i > -1; i-- )
+  {
+      for( int j = 0; j< numBlockY; j++ )
+          std::cout << BlockIterHost[ i*numBlockY + j ];
+      std::cout << std::endl;
+  }
+  std::cout << std::endl;*/
+  delete[] BlockIterPom;
+}
 template< typename Real,
          typename Device,
          typename Index >

--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -88,7 +88,8 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
      using typename BaseType::InterfaceMapType;
      using typename BaseType::MeshFunctionType;
      using typename BaseType::InterfaceMapPointer;
-      using typename BaseType::MeshFunctionPointer;      
+      using typename BaseType::MeshFunctionPointer;
+      using typename BaseType::ArrayContainer;
      FastSweepingMethod();

--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -258,13 +258,21 @@ solve( const MeshPointer& mesh,
          tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
-          int *BlockIterDevice;
          int BlockIterD = 1;
-          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );
+          TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
+          BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
+          BlockIterDevice.setValue( 1 );
+          /*int *BlockIterDevice;
+          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
          int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
-          int *dBlock;
-          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
+          TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+          dBlock.setSize( nBlocks );
+          dBlock.setValue( 0 );
+          /*int *dBlock;
+          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
          while( BlockIterD )
          {
@@ -272,17 +280,24 @@ solve( const MeshPointer& mesh,
                                                              interfaceMapPtr.template getData< Device >(),
                                                              auxPtr.template modifyData< Device>(),
                                                              BlockIterDevice );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
-            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
          }
-          cudaFree( BlockIterDevice );
+          //cudaFree( BlockIterDevice );
-          cudaFree( dBlock );
+          //cudaFree( dBlock );
          cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
          aux = *auxPtr;
@@ -302,7 +317,7 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice )
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
 {
    int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
    int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;