From 223c69a75ed82b7a8adabb8713f97ee716fcd3ae Mon Sep 17 00:00:00 2001
From: fencl <fenclmat@fjfi.cvut.cz>
Date: Thu, 23 Aug 2018 11:10:16 +0200
Subject: [PATCH] Implement a parallel CUDA version of the fast sweeping method (FSM)

The 2D solver now passes the auxiliary values to its CUDA kernel as a raw
device array (copied in and out by the helper kernel aux1), and the 3D solver
gets a shared-memory kernel that sweeps each block until convergence, with
per-block change flags reduced via CudaParallelReduc.

---
 .../tnlDirectEikonalMethodsBase.h             |  14 +-
 .../tnlDirectEikonalMethodsBase_impl.h        |  74 ++++++
 .../tnlFastSweepingMethod2D_impl.h            |  73 ++++--
 .../tnlFastSweepingMethod3D_impl.h            | 237 ++++++++++++++++--
 4 files changed, 359 insertions(+), 39 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 74393f1fd6..a406a5b177 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -101,6 +101,10 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
       __cuda_callable__ void updateCell( MeshFunctionType& u,
                                          const MeshEntity& cell,
                                          const RealType velocity = 1.0);
+      
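+      // Updates the value at thread position (thri, thrj, thrk) in the shared-memory
+      // block sArray from its upwind neighbours and the grid steps hx, hy, hz;
+      // returns true if the value changed by more than 0.01 * hx.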
+      __cuda_callable__ bool updateCell( volatile Real sArray[10][10][10],
+                                         int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
+                                         const Real velocity = 1.0 );
 };
 
 template < typename T1, typename T2 >
@@ -125,10 +129,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice );
+                                      Real *aux,
+                                      int *BlockIterDevice);
 __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks );
 
+template < typename Real, typename Device, typename Index >
+__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );
+
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
                                 Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
@@ -142,7 +149,8 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux );
+                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+                                      int *BlockIterDevice );
 #endif
 
 #include "tnlDirectEikonalMethodsBase_impl.h"
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index cd4eb9e24b..31bc95df54 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -994,4 +994,78 @@ updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
     else
         return false;
 }
+
+template< typename Real,
+          typename Device,
+          typename Index >
+__cuda_callable__ 
+bool 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
+        const Real hx, const Real hy, const Real hz, const Real v )
+{
+   const RealType value = sArray[thrk][thrj][thri];
+   //std::cout << value << std::endl;
+   RealType a, b, c, tmp = TypeInfo< RealType >::getMaxValue();
+   
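+   // pick the neighbour with the smaller absolute value (upwind direction) along each axis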
+   c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ],
+                        sArray[ thrk-1 ][ thrj ][ thri ] );
+    
+   b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ],
+                        sArray[ thrk ][ thrj-1 ][ thri ] );
+   
+   a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ],
+                        sArray[ thrk ][ thrj ][ thri-1 ] );
+   
+   
+   if( fabs( a ) == TypeInfo< RealType >::getMaxValue() && 
+       fabs( b ) == TypeInfo< RealType >::getMaxValue() &&
+       fabs( c ) == TypeInfo< RealType >::getMaxValue() )
+      return false;
+   
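+    // sort the candidate values together with their grid steps and solve the
+    // 1D, 2D or 3D quadratic update, falling through to the next case when the
+    // causality condition fails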
+    RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
+    
+    sortMinims( pom );
+    
+    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
+    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+    {
+        sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+        tmp = value - sArray[ thrk ][ thrj ][ thri ];
+        if ( fabs( tmp ) >  0.01*hx )
+            return true;
+        else
+            return false;
+    }
+    else
+    {
+        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+        if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
+        {
+            sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+            tmp = value - sArray[ thrk ][ thrj ][ thri ];
+            if ( fabs( tmp ) > 0.01*hx )
+                return true;
+            else
+                return false;
+        }
+        else
+        {
+            tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+                TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+                hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+                hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+            sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+            tmp = value - sArray[ thrk ][ thrj ][ thri ];
+            if ( fabs( tmp ) > 0.01*hx )
+                return true;
+            else
+                return false;
+        }
+    }
+    
+    return false;
+}
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index efdb13da83..7d5f8cbaa9 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -27,7 +27,7 @@ template< typename Real,
           typename Anisotropy >
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 FastSweepingMethod()
-: maxIterations( 1 )
+: maxIterations( 100 )
 {
    
 }
@@ -236,6 +236,13 @@ solve( const MeshPointer& mesh,
       {
          // TODO: CUDA code
 #ifdef HAVE_CUDA
+          
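+          // raw device buffer that mirrors the aux mesh function inside the CUDA kernels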
+          Real *dAux;
+          cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) );
+          
+          
+          
+          
           const int cudaBlockSize( 16 );
           int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
           int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
@@ -244,7 +251,7 @@ solve( const MeshPointer& mesh,
           
           tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
           
-          
+          aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 );
           
           //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );
 
@@ -263,7 +270,7 @@ solve( const MeshPointer& mesh,
                        
             CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                              interfaceMapPtr.template getData< Device >(),
-                                                             auxPtr.template modifyData< Device>(),
+                                                             dAux,
                                                              BlockIterDevice );
             
             CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
@@ -275,14 +282,16 @@ solve( const MeshPointer& mesh,
                 BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
             
           }
+          aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 );
+          cudaFree( dAux );
           cudaFree( BlockIterDevice );
           cudaFree( dBlock );
           cudaDeviceSynchronize();
           
           TNL_CHECK_CUDA_DEVICE;
               
-          aux = *auxPtr;
-          interfaceMap = *interfaceMapPtr;
+          //aux = *auxPtr;
+          //interfaceMap = *interfaceMapPtr;
 #endif
       }
       iteration++;
@@ -291,6 +300,23 @@ solve( const MeshPointer& mesh,
 }
 
 #ifdef HAVE_CUDA
+template < typename Real, typename Device, typename Index >
+__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a )
+{
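+    // a == 1: copy the mesh function aux into the raw buffer dAux,
+    // a == 0: copy dAux back into aux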
+    int i = threadIdx.x + blockDim.x*blockIdx.x;
+    int j = blockDim.y*blockIdx.y + threadIdx.y;
+    const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 )
+    {    
+        dAux[ j*mesh.getDimensions().x() + i ] = aux[ j*mesh.getDimensions().x() + i ];
+    }
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 )
+    {    
+        aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ];
+    }
+    
+}
+
 __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks )
 {
     int i = threadIdx.x;
@@ -341,16 +367,17 @@ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlock
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+                                      Real *aux,
                                       int *BlockIterDevice )
 {
     int thri = threadIdx.x; int thrj = threadIdx.y;
     int blIdx = blockIdx.x; int blIdy = blockIdx.y;
     int i = thri + blockDim.x*blIdx;
     int j = blockDim.y*blIdy + thrj;
-    int currentIndex = thrj * 16 + thri;
+    int currentIndex = thrj * blockDim.x + thri;
     
-    __shared__ volatile bool changed[256];
+    //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
+    __shared__ volatile bool changed[16*16];
     changed[ currentIndex ] = false;
     
     if( thrj == 0 && thri == 0 )
@@ -365,7 +392,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         hy = mesh.getSpaceSteps().y();
     }
     
-    __shared__ volatile Real sArray[ 18 ][ 18 ];
+    //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
+    __shared__ volatile Real sArray[18][18];
     sArray[thrj][thri] = TypeInfo< Real >::getMaxValue();
     
     //filling sArray edges
@@ -440,23 +468,28 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         {
             if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
             {
-                changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx, hy );
+                changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
             }
         }
         __syncthreads();
         
-        /*if( thri == 0 && thrj == 0 && blIdx == 1 && blIdy == 3 ){
-            for( int h = 15; h > -1; h-- ){
-                for( int g = 0; g < 16; g++ ){
-                    printf( "%d\t", changed[h*16+g] );
-                }
-                printf("\n");
+    //pyramid reduction
+        if( blockDim.x*blockDim.y == 1024 )
+        {
+            if( currentIndex < 512 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
             }
-            printf("\n");
         }
-        __syncthreads();*/
-        
-    //pyramid reduction
+        __syncthreads();
+        if( blockDim.x*blockDim.y >= 512 )
+        {
+            if( currentIndex < 256 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+            }
+        }
+        __syncthreads();
         if( blockDim.x*blockDim.y >= 256 )
         {
             if( currentIndex < 128 )
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index e9fa72e666..c20d9a2faa 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -255,13 +255,34 @@ solve( const MeshPointer& mesh,
               std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
           dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
           dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-          Devices::Cuda::synchronizeDevice();
-          
+                 
           tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
-          for( int k = 0; k < numBlocksX; k++)
-          CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                                                  interfaceMapPtr.template getData< Device >(),
-                                                                                  auxPtr.template modifyData< Device>() );
+          
+          int *BlockIterDevice;
+          int BlockIterD = 1;
+          
+          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );
+          int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
+          int *dBlock;
+          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
+          
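+          // sweep until no block reports a change; BlockIterD holds the reduced
+          // flag copied back from the device after each pass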
+          while( BlockIterD )
+          {
+             CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                              interfaceMapPtr.template getData< Device >(),
+                                                              auxPtr.template modifyData< Device>(),
+                                                              BlockIterDevice );
+            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
+            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            
+            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+                                   
+            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
+                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
+            
+          }
+          cudaFree( BlockIterDevice );
+          cudaFree( dBlock );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
           aux = *auxPtr;
@@ -280,27 +301,211 @@ solve( const MeshPointer& mesh,
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux )
+                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+                                      int *BlockIterDevice )
 {
+    int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
+    int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
     int i = threadIdx.x + blockDim.x*blockIdx.x;
     int j = blockDim.y*blockIdx.y + threadIdx.y;
     int k = blockDim.z*blockIdx.z + threadIdx.z;
+    int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
+    
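+    // per-thread flags recording whether this thread's cell was updated in the
+    // current sweep; reduced below into changed[ 0 ]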
+    __shared__ volatile bool changed[8*8*8];
+    changed[ currentIndex ] = false;
+    
+    if( thrj == 0 && thri == 0 && thrk == 0 )
+        changed[ 0 ] = true;
+    
     const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+    __shared__ Real hx;
+    __shared__ Real hy;
+    __shared__ Real hz;
+    if( thrj == 1 && thri == 1 && thrk == 1 )
+    {
+        hx = mesh.getSpaceSteps().x();
+        hy = mesh.getSpaceSteps().y();
+        hz = mesh.getSpaceSteps().z();
+    }
+    __shared__ volatile Real sArray[10][10][10];
+    sArray[thrk][thrj][thri] = TypeInfo< Real >::getMaxValue();
+    if(thri == 0 )
+    {
+        sArray[8][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
+        sArray[9][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
+        sArray[thrk+1][thrj+1][8] = TypeInfo< Real >::getMaxValue();
+        sArray[thrk+1][thrj+1][9] = TypeInfo< Real >::getMaxValue();
+        sArray[thrj+1][8][thrk+1] = TypeInfo< Real >::getMaxValue();
+        sArray[thrj+1][9][thrk+1] = TypeInfo< Real >::getMaxValue();
+    }
+            
+    //filling sArray edges
+    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+    int dimZ = mesh.getDimensions().z();
+    __shared__ volatile int numOfBlockx;
+    __shared__ volatile int numOfBlocky;
+    __shared__ volatile int numOfBlockz;
+    __shared__ int xkolik;
+    __shared__ int ykolik;
+    __shared__ int zkolik;
+    if( thri == 0 && thrj == 0 && thrk == 0 )
+    {
+        xkolik = blockDim.x + 1;
+        ykolik = blockDim.y + 1;
+        zkolik = blockDim.z + 1;
+        numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
+        numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
+        numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0);
+        
+        if( numOfBlockx - 1 == blIdx )
+            xkolik = dimX - (blIdx)*blockDim.x+1;
+
+        if( numOfBlocky -1 == blIdy )
+            ykolik = dimY - (blIdy)*blockDim.y+1;
+        if( numOfBlockz-1 == blIdz )
+            zkolik = dimZ - (blIdz)*blockDim.z+1;
+        
+        BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0;
+    }
+    __syncthreads();
+    
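+    // load the six halo faces of the shared block from the neighbouring blocks,
+    // or fill them with the maximum value at the domain boundary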
+    if( thri == 0 )
+    {        
+        if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
+        else
+            sArray[thrk+1][thrj+1][0] = TypeInfo< Real >::getMaxValue();
+    }
+    
+    if( thri == 1 )
+    {
+        if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
+        else
+            sArray[thrk+1][thrj+1][9] = TypeInfo< Real >::getMaxValue();
+    }
+    if( thri == 2 )
+    {        
+        if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
+        else
+            sArray[thrk+1][0][thrj+1] = TypeInfo< Real >::getMaxValue();
+    }
+    
+    if( thri == 3 )
+    {
+        if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
+        else
+            sArray[thrk+1][9][thrj+1] = TypeInfo< Real >::getMaxValue();
+    }
+    if( thri == 4 )
+    {        
+        if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
+        else
+            sArray[0][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
+    }
+    
+    if( thri == 5 )
+    {
+        if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
+        else
+            sArray[9][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
+    }
     
     if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
     {
-        typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
-        Cell cell( mesh );
-        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
-        cell.refresh();
-        //tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
-        for( int l = 0; l < 10; l++ )
+        sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
+    }
+    __shared__ volatile int loopcounter;
+    loopcounter = 0;
+    __syncthreads(); 
+    while( changed[ 0 ] )
+    {
+        __syncthreads();
+        
+        changed[ currentIndex ] = false;
+        
+    //calculation of update cell
+        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ )
         {
-            if( ! interfaceMap( cell ) )
+            if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] )
             {
-               ptr.updateCell( aux, cell );
+                changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
             }
         }
+        __syncthreads();
+        
+    //pyramid reduction
+        if( blockDim.x*blockDim.y*blockDim.z == 1024 )
+        {
+            if( currentIndex < 512 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+            }
+        }
+        __syncthreads();
+        if( blockDim.x*blockDim.y*blockDim.z >= 512 )
+        {
+            if( currentIndex < 256 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+            }
+        }
+        __syncthreads();
+        if( blockDim.x*blockDim.y*blockDim.z >= 256 )
+        {
+            if( currentIndex < 128 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+            }
+        }
+        __syncthreads();
+        if( blockDim.x*blockDim.y*blockDim.z >= 128 )
+        {
+            if( currentIndex < 64 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+            }
+        }
+        __syncthreads();
+        if( currentIndex < 32 ) // the following ifs rely on warp-synchronous execution (threads of one warp run in lockstep)
+        {
+            if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+            if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+            if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+            if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+            if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+            if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+        }
+        __syncthreads();
+        
+        /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
+        {
+            for(int m = 0; m < 8; m++){
+                for(int n = 0; n<8; n++){
+                    for(int b=0; b<8; b++)
+                        printf(" %i ", changed[m*64 + n*8 + b]);
+                    printf("\n");
+                }
+                printf("\n \n");
+            }
+        }*/
+        if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
+        {
+            //loopcounter++;
+            BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
+        }
+        __syncthreads();
+        /*if(thri == 0 && thrj==0 && thrk==0)
+            printf("%i \n",loopcounter);
+        if(loopcounter == 500)
+            break;*/
     }
-}
+  
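+    // write the updated shared-memory values back to the global aux array (skipping interface cells)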
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY+j * mesh.getDimensions().x() + i ]) )
+        aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
+}   
 #endif
-- 
GitLab