From 8a8b083e31f59bce56fefdff27375b7c04027ab8 Mon Sep 17 00:00:00 2001
From: fencl <fenclmat@fjfi.cvut.cz>
Date: Thu, 17 May 2018 08:54:40 +0200
Subject: [PATCH] pyramid reduction in 2D with loopCounter

---
 .../tnlDirectEikonalMethodsBase.h             |   5 +-
 .../tnlDirectEikonalMethodsBase_impl.h        | 141 ++------------
 .../tnlFastSweepingMethod2D_impl.h            | 184 +++++++++++++++---
 3 files changed, 177 insertions(+), 153 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 69fce1467f..979c70e7b5 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -69,7 +69,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
                                          const MeshEntity& cell,
                                          const RealType velocity = 1.0 );
       
-      __cuda_callable__ void updateCell( Real sArray[18][18],
+      __cuda_callable__ bool updateCell( volatile Real sArray[18][18],
                                          int thri, int thrj, const Real hx, const Real hy,
                                          const Real velocity = 1.0 );
       
@@ -116,7 +116,8 @@ __cuda_callable__ void sortMinims( T1 pom[] );
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux );
+                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+                                      bool *BlockIterDevice );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index e52f2fe1c6..a3af357c94 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -684,124 +684,9 @@ setsArray( MeshFunctionType& aux, Real sArray[18][18], int dimX, int dimY, int b
         {
             sArray[0][i] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + 0*dimX + i ];
         }
-    }
-    
-    /*int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0);
-    int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0);
-    int xkolik = 18;
-    int ykolik = 18;
-
-    if( numOfBlockx - 1 == blockIdx )
-        xkolik = dimX - (numOfBlockx-1)*16+1;
-
-    if( numOfBlocky -1 == blockIdy )
-        ykolik = dimY - (numOfBlocky-1)*16+1;
-
-
-    if( numOfBlockx == 0 || numOfBlocky == 0 )
-        return;
-    else
-    {
-        if( blockIdx == 0 )
-        {
-            if( blockIdy == 0 )
-            {
-                for( int j = 0; j < ykolik-1; j++ )
-                  for( int i = 0; i < xkolik-1; i++ )
-                    {
-                      sArray[ j+1 ][ i+1 ] = aux[ j*dimX + i ];
-                    }
-
-                //vypis( *sArray );
-            }
-            else
-            {
-                for( int j = 0; j < ykolik; j++ )
-                  for( int i = 0; i < xkolik-1; i++ )
-                    {
-                      sArray[ j ][ i+1 ] = aux[ blockIdy*16*dimX - dimX + j*dimX + i ];
-                    }
-            }
-        }
-        else if( blockIdy == 0 )
-        {
-            for( int j = 0; j < ykolik-1; j++ )
-              for( int i = 0; i < xkolik; i++ )
-                 {
-                   sArray[ j+1 ][ i ] = aux[ blockIdx*16 - 1 + j*dimX + i ];
-                 }
-        }
-        else
-        {
-           for( int j = 0; j < ykolik; j++ )
-              for( int i = 0; i < xkolik; i++ )
-                {
-                  //if( dimX*dimY-1 >  blockIdy*16*dimX - dimX + blockIdx*16 - 1 + j*dimX + i )
-                    sArray[ j ][ i ] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + j*dimX + i ];
-                }
-        }
-    }*/
+    }   
 }
 
-/*template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__ void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-getsArray( MeshFunctionType& aux, Real sArray[18][18], int dimX, int dimY, int blockIdx, int blockIdy )
-{
-    int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0);
-    int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0);
-    int xkolik = 18;
-    int ykolik = 18;
-    
-    if( numOfBlockx - 1 == blockIdx )
-        xkolik = dimX - (numOfBlockx-1)*16+2;
-
-    if( numOfBlocky -1 == blockIdy )
-        ykolik = dimY - (numOfBlocky-1)*16+2;
-
-    if( numOfBlockx == 0 || numOfBlocky == 0 )
-        return;
-    else
-    {
-        if( blockIdx == 0 )
-        {
-            if( blockIdy == 0 )
-            {
-               for( int j = 0; j < ykolik-2; j++ )
-                  for( int i = 0; i < xkolik-2; i++ )
-                    {
-                      aux[ j*dimX + i ] = sArray[ j+1 ][ i+1 ];
-                    }
-            }
-            else
-            {
-               for( int j = 1; j < ykolik-1; j++ )
-                  for( int i = 0; i < xkolik-2; i++ )
-                    {
-                      aux[ blockIdy*16*dimX - dimX + j*dimX + i ] = sArray[ j ][ i+1 ];
-                    }
-            }
-        }
-        else if( blockIdy == 0 )
-        {
-            for( int j = 0; j < ykolik-2; j++ )
-               for( int i = 1; i < xkolik-1; i++ )
-                 {
-                   aux[ blockIdx*16 - 1 + j*dimX + i ] = sArray[ j+1 ][ i ];
-                 }
-        }
-        else
-        {
-            for( int j = 1; j < ykolik-1; j++ )    
-              for( int i = 1; i < xkolik-1; i++ )
-                {
-                    aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + j*dimX + i ] = sArray[ j ][ i ];
-                }
-        }
-    }
-}*/
 
 
 #ifdef HAVE_CUDA
@@ -974,9 +859,9 @@ template< typename Real,
           typename Device,
           typename Index >
 __cuda_callable__
-void
+bool
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-updateCell( Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy,
+updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy,
             const Real v )
 {
    
@@ -991,39 +876,39 @@ updateCell( Real sArray[18][18], int thri, int thrj, const Real hx, const Real h
    const RealType& hy = mesh.getSpaceSteps().y();*/
    const RealType value = sArray[ thrj ][ thri ];//u( cell );
    RealType a, b, tmp = TypeInfo< RealType >::getMaxValue();
-   
-   /*if( value != u[ cell.getIndex() ] )
-       return;*/
-   
+      
    b = TNL::argAbsMin( sArray[ thrj+1 ][ thri ],
                         sArray[ thrj-1 ][ thri ] );
     
    a = TNL::argAbsMin( sArray[ thrj ][ thri+1 ],
                         sArray[ thrj ][ thri-1 ] );
 
-
-    
     if( fabs( a ) == TypeInfo< RealType >::getMaxValue() && 
         fabs( b ) == TypeInfo< RealType >::getMaxValue() )
-       return;
+       return false;
    
-    RealType pom[6] = { a, b, TypeInfo< Real >::getMaxValue(), (RealType)hx, (RealType)hy, 0.0 };
+    RealType pom[6] = { a, b, TypeInfo< Real >::getMaxValue(), (RealType)hx, (RealType)hy, TypeInfo< Real >::getMaxValue() };
     sortMinims( pom );
     tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
     
                                 
     if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
     {
-        //u[ cell.getIndex() ]= argAbsMin( value, tmp );
         sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
+        if ( fabs( value - sArray[ thrj ][ thri ] ) >  0.001 )
+            return true;
+        return false;
     }
     else
     {
         tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
             TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
             ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-        //u[ cell.getIndex() ]= argAbsMin( value, tmp );
         sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
+        if ( fabs( value - sArray[ thrj ][ thri ] ) > 0.001 )
+            return true;
+        return false;
     }
+    return false;
 }
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 1279e3437f..d4cb1f63be 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -227,12 +227,47 @@ solve( const MeshPointer& mesh,
           tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
           int nBlockIter = numBlocksX > numBlocksY ? numBlocksX : numBlocksY;
           nBlockIter = numBlocksX == numBlocksY ? nBlockIter + 1 : nBlockIter;
-          for( int k = 0; k < nBlockIter; k++)
+          
+          bool isNotDone = true;
+          bool* BlockIter = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );
+          
+          bool *BlockIterDevice;
+          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ) );
+          
+          while( isNotDone )
+          {
+           for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+                BlockIter[ i ] = false;   
+           
+            isNotDone = false;
+            
             CudaUpdateCellCaller<<< gridSize, blockSize, 18 * 18 * sizeof( Real ) >>>( ptr,
                                                                                     interfaceMapPtr.template getData< Device >(),
-                                                                                    auxPtr.template modifyData< Device>() );
+                                                                                    auxPtr.template modifyData< Device>(),
+                                                                                    BlockIterDevice );
+            cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ), cudaMemcpyDeviceToHost);
+            for( int i = 0; i < numBlocksX; i++ )
+            {    for( int j = 0; j < numBlocksY; j++ )
+                {
+                    if( BlockIter[ j * numBlocksY + i ] )
+                        std::cout << "true." << "\t";
+                    else
+                        std::cout << "false." << "\t";    
+                }
+                std::cout << std::endl;
+            }
+            std::cout << std::endl;
+             
+            for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+                isNotDone = isNotDone || BlockIter[ i ];
+            
+          }
+          delete[] BlockIter;
+          cudaFree( BlockIterDevice );
           cudaDeviceSynchronize();
+          
           TNL_CHECK_CUDA_DEVICE;
+              
           aux = *auxPtr;
           interfaceMap = *interfaceMapPtr;
 #endif
@@ -246,46 +281,149 @@ solve( const MeshPointer& mesh,
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux )
+                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+                                      bool *BlockIterDevice )
 {
     int thri = threadIdx.x; int thrj = threadIdx.y; // nelze ke stejnym pristupovat znovu pres threadIdx (vzdy da jine hodnoty)
     int blIdx = blockIdx.x; int blIdy = blockIdx.y;
     int i = thri + blockDim.x*blIdx;
     int j = blockDim.y*blIdy + thrj;
+    
+    __shared__ volatile bool changed[256];
+    changed[ thrj * blockDim.x + thri ] = false;
+     __shared__ volatile bool SmallCycleBoolin;
+    if( thrj == 0 && thri == 0 )
+        SmallCycleBoolin = true;
+    
+    
     const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
     const Real hx = mesh.getSpaceSteps().x();
     const Real hy = mesh.getSpaceSteps().y();
     
-    __shared__ Real sArray[ 18 ][ 18 ];
-    for( int k = 0; k < 18; k++ )
-        for( int l = 0; l < 18; l++ )
-            sArray[ k ][ l ] = TypeInfo< Real >::getMaxValue();
-    __syncthreads();
-    /*//filling shared array
-    ptr.setsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy );
-    __syncthreads();*/
+    __shared__ volatile Real sArray[ 18 ][ 18 ];
+    sArray[thrj][thri] = TypeInfo< Real >::getMaxValue();
     
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+    //filling sArray edges
+    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+    int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0);
+    int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0);
+    int xkolik = 17;
+    int ykolik = 17;
+    
+    
+    if( numOfBlockx - 1 == blIdx )
+        xkolik = dimX - (blIdx)*16+1;
+
+    if( numOfBlocky -1 == blIdy )
+        ykolik = dimY - (blIdy)*16+1;
+       
+    
+    if( thri == 0 )
+    {        
+        if( dimX > (blIdx+1) * 16  && thrj+1 < ykolik )
+            sArray[thrj+1][xkolik] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + (thrj+1)*dimX + 17 ];
+        else
+            sArray[thrj+1][xkolik] = TypeInfo< Real >::getMaxValue();
+    }
+    
+    if( thri == 1 )
+    {
+        if( blIdx != 0 && thrj+1 < ykolik )
+            sArray[thrj+1][0] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + (thrj+1)*dimX + 0];
+        else
+            sArray[thrj+1][0] = TypeInfo< Real >::getMaxValue();
+    }
+    
+    if( thrj == 0 )
     {
+        if( dimY > (blIdy+1) * 16  && thri+1 < xkolik )
+            sArray[ykolik][thri+1] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + 17*dimX + thri+1 ];
+        else
+           sArray[ykolik][thri+1] = TypeInfo< Real >::getMaxValue();
+    }
+    
+    if( thrj == 1 )
+    {
+        if( blIdy != 0 && thri+1 < xkolik )
+            sArray[0][thri+1] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + 0*dimX + thri+1 ];
+        else
+            sArray[0][thri+1] = TypeInfo< Real >::getMaxValue();
+    }
+    
+    //filling BlockIterDevice
+    BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = false;
+    
+    
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+    {    
         sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
-        ptr.setsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy ); //fill edges of sArray
         __syncthreads();
-            
+    }    
+    
+    __shared__ int loopcounter;
+    if( thri == 0 && thrj == 0 )
+        loopcounter= 0;
+  //if( blIdx == 5 && blIdy == 4 )
+    while( SmallCycleBoolin )   
+    {
+        if( thri == 1 && thrj == 1 )
+            SmallCycleBoolin = false;
+        
+        changed[ thrj * 16 + thri ] = false;
+        __syncthreads();
         
-        if( ! interfaceMap[ j*mesh.getDimensions().x() + i ] ) 
+    //calculation of update cell
+        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+        { 
+            if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
+            {
+                changed[ thrj * 16 + thri ] = ptr.updateCell( sArray, thri+1, thrj+1, hx, hy );
+                __syncthreads();   
+            }
+        }
+        
+    //pyramid reduction
+        if( blockDim.x*blockDim.y >= 256 )
+        {
+            if( (thrj * 16 + thri) < 128 )
+            {
+                changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 128 ];
+            }
+        __syncthreads();
+        }
+        if( blockDim.x*blockDim.y >= 128 )
         {
-            for( int k = 0; k < 17; k++ )
+            if( (thrj * 16 + thri) < 64 )
             {
-                ptr.updateCell( sArray, thri+1, thrj+1, hx, hy );
-                __syncthreads();
+                changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 64 ];
             }
+        __syncthreads();
+        }
+        if( (thrj * 16 + thri) < 32 )
+        {
+            changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 32 ];
+            changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 16 ];
+            changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 8 ];
+            changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 4 ];
+            changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 2 ];
+            changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 1 ];
+        }
+        __syncthreads();
+
+        if( thrj == 1 && thri == 1 )
+        {
+            loopcounter++;
+            if( loopcounter > 1000 )
+                break;
+            SmallCycleBoolin = changed[ 0 ];
+            //if( SmallCycleBoolin )
+                BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = SmallCycleBoolin;
         }
-        /*for( int k = 0; k < mesh.getDimensions().x(); k++)
-            for( int l = 0; l < mesh.getDimensions().y(); l++)
-                aux[ k*mesh.getDimensions().x() + l ] = TypeInfo< Real >::getMaxValue();*/
-        aux[j*mesh.getDimensions().x() + i] = sArray[thrj+1][thri+1];
         __syncthreads();
     }
-    //ptr.getsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy );
+        
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+        aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
+        
 }
 #endif
-- 
GitLab